Issue #22687: Fixed some corner cases in breaking words in textwrap.

Got rid of quadratic complexity in breaking long words.
This commit is contained in:
Serhiy Storchaka 2015-03-24 18:32:27 +02:00
parent b365a06a84
commit 72bd327db0
3 changed files with 38 additions and 4 deletions

View file

@ -79,10 +79,25 @@ class TextWrapper:
# splits into
# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
# (after stripping out empty strings).
# Word-separator pattern built from three alternatives wrapped in a single
# capturing group:
#   1. a run of whitespace;
#   2. optional leading punctuation, then word characters ending in a
#      non-digit word character and a hyphen, provided more word characters
#      ending in a non-digit word character follow (hyphenated words);
#   3. two or more hyphens that sit between a word/punctuation character
#      and a word character (em-dash).
# NOTE(review): wordsep_re is assigned again a few lines further down, which
# rebinds this name -- this looks like the pre-change side of a diff hunk;
# confirm which definition is meant to remain.
wordsep_re = re.compile(
r'(\s+|' # any whitespace
r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words
r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
# Character class accepted immediately before an em-dash: a word character
# or one of ! " ' & . , ?
word_punct = r'[\w!"\'&.,?]'
# A word character that is not a digit (neither \d nor \W).
letter = r'[^\d\W]'
# Word-separator pattern, written with re.VERBOSE so the whitespace and the
# '#' comments inside the pattern text are ignored by the regex compiler.
# The two fragments above are interpolated via %-formatting as %(wp)s and
# %(lt)s.  Everything is wrapped in one capturing group with three
# alternatives, tried in this order:
#   1. a run of whitespace;
#   2. two or more hyphens preceded by a word_punct character and followed
#      by a word character (em-dash between words);
#   3. a lazy run of non-whitespace (\S+?) that stops at the first of:
#        - a hyphen preceded by two letters, or by letter-hyphen-letter,
#          and followed by a letter, an optional hyphen, and another letter
#          (break point inside a hyphenated word);
#        - a position followed by whitespace or the end of the string;
#        - a position after a word_punct character that is followed by an
#          em-dash and a word character.
wordsep_re = re.compile(r'''
( # any whitespace
\s+
| # em-dash between words
(?<=%(wp)s) -{2,} (?=\w)
| # word, possibly hyphenated
\S+? (?:
# hyphenated word
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
(?= %(lt)s -? %(lt)s)
| # end of word
(?=\s|\Z)
| # em-dash
(?<=%(wp)s) (?=-{2,}\w)
)
)''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
# The fragments were only needed to build the pattern; delete the names so
# they do not linger in the enclosing namespace (presumably the TextWrapper
# class body -- the class header is outside this view).
del word_punct, letter
# This less funky little regex just splits on recognized spaces. E.g.
# "Hello there -- you goof-ball, use the -b option!"