mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 03:44:55 +00:00 
			
		
		
		
	#10713: Improve documentation for \b and \B and add a few tests. Initial patch and tests by Martin Pool.
This commit is contained in:
		
							parent
							
								
									0555cde98d
								
							
						
					
					
						commit
						38ae5b2392
					
				
					 2 changed files with 36 additions and 5 deletions
				
			
		| 
						 | 
					@ -325,14 +325,19 @@ the second character.  For example, ``\$`` matches the character ``'$'``.
 | 
				
			||||||
   Matches the empty string, but only at the beginning or end of a word.  A word is
 | 
					   Matches the empty string, but only at the beginning or end of a word.  A word is
 | 
				
			||||||
   defined as a sequence of alphanumeric or underscore characters, so the end of a
 | 
					   defined as a sequence of alphanumeric or underscore characters, so the end of a
 | 
				
			||||||
   word is indicated by whitespace or a non-alphanumeric, non-underscore character.
 | 
					   word is indicated by whitespace or a non-alphanumeric, non-underscore character.
 | 
				
			||||||
   Note that  ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the
 | 
					   Note that formally, ``\b`` is defined as the boundary between a ``\w`` and
 | 
				
			||||||
   precise set of characters deemed to be alphanumeric depends on the values of the
 | 
					   a ``\W`` character (or vice versa), or between ``\w`` and the beginning/end
 | 
				
			||||||
   ``UNICODE`` and ``LOCALE`` flags.  Inside a character range, ``\b`` represents
 | 
					   of the string, so the precise set of characters deemed to be alphanumeric
 | 
				
			||||||
   the backspace character, for compatibility with Python's string literals.
 | 
					   depends on the values of the ``UNICODE`` and ``LOCALE`` flags.
 | 
				
			||||||
 | 
					   For example, ``r'\bfoo\b'`` matches ``'foo'``, ``'foo.'``, ``'(foo)'``,
 | 
				
			||||||
 | 
					   ``'bar foo baz'`` but not ``'foobar'`` or ``'foo3'``.
 | 
				
			||||||
 | 
					   Inside a character range, ``\b`` represents the backspace character, for compatibility with Python's string literals.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``\B``
 | 
					``\B``
 | 
				
			||||||
   Matches the empty string, but only when it is *not* at the beginning or end of a
 | 
					   Matches the empty string, but only when it is *not* at the beginning or end of a
 | 
				
			||||||
   word.  This is just the opposite of ``\b``, so is also subject to the settings
 | 
					   word.  This means that ``r'py\B'`` matches ``'python'``, ``'py3'``, ``'py2'``,
 | 
				
			||||||
 | 
					   but not ``'py'``, ``'py.'``, or ``'py!'``.
 | 
				
			||||||
 | 
					   ``\B`` is just the opposite of ``\b``, so is also subject to the settings
 | 
				
			||||||
   of ``LOCALE`` and ``UNICODE``.
 | 
					   of ``LOCALE`` and ``UNICODE``.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``\d``
 | 
					``\d``
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -373,6 +373,32 @@ class ReTests(unittest.TestCase):
 | 
				
			||||||
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
 | 
					        self.assertEqual(re.search(r"\d\D\w\W\s\S",
 | 
				
			||||||
                                   "1aa! a", re.UNICODE).group(0), "1aa! a")
 | 
					                                   "1aa! a", re.UNICODE).group(0), "1aa! a")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_string_boundaries(self):
 | 
				
			||||||
 | 
					        # See http://bugs.python.org/issue10713
 | 
				
			||||||
 | 
					        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
 | 
				
			||||||
 | 
					                         "abc")
 | 
				
			||||||
 | 
					        # There's a word boundary at the start of a string.
 | 
				
			||||||
 | 
					        self.assertTrue(re.match(r"\b", "abc"))
 | 
				
			||||||
 | 
					        # A string of adjacent word characters includes a non-boundary
					        # zero-length match (between the characters).
 | 
				
			||||||
 | 
					        self.assertTrue(re.search(r"\B", "abc"))
 | 
				
			||||||
 | 
					        # There is no non-boundary match at the start of a string.
 | 
				
			||||||
 | 
					        self.assertFalse(re.match(r"\B", "abc"))
 | 
				
			||||||
 | 
					        # However, an empty string contains no word boundaries, and also no
 | 
				
			||||||
 | 
					        # non-boundaries.
 | 
				
			||||||
 | 
					        self.assertEqual(re.search(r"\B", ""), None)
 | 
				
			||||||
 | 
					        # This one is questionable and different from the perlre behavior,
 | 
				
			||||||
 | 
					        # but describes current behavior.
 | 
				
			||||||
 | 
					        self.assertEqual(re.search(r"\b", ""), None)
 | 
				
			||||||
 | 
					        # A single word-character string has two boundaries, but no
 | 
				
			||||||
 | 
					        # non-boundary gaps.
 | 
				
			||||||
 | 
					        self.assertEqual(len(re.findall(r"\b", "a")), 2)
 | 
				
			||||||
 | 
					        self.assertEqual(len(re.findall(r"\B", "a")), 0)
 | 
				
			||||||
 | 
					        # If there are no words, there are no boundaries.
 | 
				
			||||||
 | 
					        self.assertEqual(len(re.findall(r"\b", " ")), 0)
 | 
				
			||||||
 | 
					        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
 | 
				
			||||||
 | 
					        # Can match around the whitespace.
 | 
				
			||||||
 | 
					        self.assertEqual(len(re.findall(r"\B", " ")), 2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_bigcharset(self):
 | 
					    def test_bigcharset(self):
 | 
				
			||||||
        self.assertEqual(re.match(u"([\u2222\u2223])",
 | 
					        self.assertEqual(re.match(u"([\u2222\u2223])",
 | 
				
			||||||
                                  u"\u2222").group(1), u"\u2222")
 | 
					                                  u"\u2222").group(1), u"\u2222")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue