gh-130167: Optimise `textwrap.dedent()` (#131919)

Co-authored-by: Marius Juston <marius.juston@hotmail.fr>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
This commit is contained in:
Adam Turner 2025-03-31 01:35:12 +01:00 committed by GitHub
parent 685fd74f81
commit 6aa88a2cb3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 67 additions and 36 deletions

View file

@ -769,6 +769,56 @@ class DedentTestCase(unittest.TestCase):
"""assert that dedent() has no effect on 'text'"""
self.assertEqual(text, dedent(text))
def test_dedent_whitespace(self):
    # Exercise dedent() on empty, whitespace-only and blank-line inputs.

    # The empty string.
    self.assertUnchanged("")

    # (input text, expected result) pairs; they are checked in order and
    # the first mismatch fails the test.
    cases = (
        # Only spaces.
        (" ", ""),
        # Only tabs.
        ("\t\t\t\t", ""),
        # A mixture.
        (" \t \t\t \t ", ""),
        # ASCII whitespace.
        ("\f\n\r\t\v ", "\n"),
        # One newline.
        ("\n", "\n"),
        # Windows-style newlines.
        ("\r\n" * 5, "\n" * 5),
        # Whitespace mixture.
        (" \n\t\n \n\t\t\n\n\n ", "\n\n\n\n\n\n"),
        # Lines consisting only of whitespace are always normalised
        ("a\n \n\t\n", "a\n\n\n"),
        # Whitespace characters on non-empty lines are retained
        ("a\r\n\r\n\r\n", "a\r\n\n\n"),
    )
    for source, wanted in cases:
        self.assertEqual(wanted, dedent(source))
def test_dedent_nomargin(self):
# No lines indented.
text = "Hello there.\nHow are you?\nOh good, I'm glad."

View file

@ -413,9 +413,6 @@ def shorten(text, width, **kwargs):
# -- Loosely related functionality -------------------------------------
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
def dedent(text):
"""Remove any common leading whitespace from every line in `text`.
@ -429,42 +426,21 @@ def dedent(text):
Entirely blank lines are normalized to a newline character.
"""
# Look for the longest leading string of spaces and tabs common to
# all lines.
margin = None
text = _whitespace_only_re.sub('', text)
indents = _leading_whitespace_re.findall(text)
for indent in indents:
if margin is None:
margin = indent
if not text:
return text
# Current line more deeply indented than previous winner:
# no change (previous winner is still on top).
elif indent.startswith(margin):
pass
lines = text.split('\n')
# Current line consistent with and no deeper than previous winner:
# it's the new winner.
elif margin.startswith(indent):
margin = indent
# Get length of leading whitespace, inspired by ``os.path.commonprefix()``.
non_blank_lines = [l for l in lines if l and not l.isspace()]
l1 = min(non_blank_lines, default='')
l2 = max(non_blank_lines, default='')
margin = 0
for margin, c in enumerate(l1):
if c != l2[margin] or c not in ' \t':
break
# Find the largest common whitespace between current line and previous
# winner.
else:
for i, (x, y) in enumerate(zip(margin, indent)):
if x != y:
margin = margin[:i]
break
# sanity check (testing/debugging only)
if 0 and margin:
for line in text.split("\n"):
assert not line or line.startswith(margin), \
"line = %r, margin = %r" % (line, margin)
if margin:
text = re.sub(r'(?m)^' + margin, '', text)
return text
return '\n'.join([l[margin:] if not l.isspace() else '' for l in lines])
def indent(text, prefix, predicate=None):

View file

@ -0,0 +1,5 @@
Improved performance of :func:`textwrap.dedent` by an average of ~2.4x
(with improvements of up to 4x for large inputs),
and fixed a bug where blank lines containing whitespace characters other than
space or horizontal tab were not normalised to a newline.
Patch by Adam Turner, Marius Juston, and Pieter Eendebak.