#23144: Make sure that HTMLParser.feed() returns all the data, even when convert_charrefs is True.

2025-11-25 04:34:37 +00:00 · 2015-09-06 21:38:06 +03:00 · 2015-09-06 21:38:06 +03:00 · 6f2bb98966
commit 6f2bb98966
parent 527ef0792f
3 changed files with 25 additions and 5 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -198,7 +198,15 @@ class HTMLParser(_markupbase.ParserBase):
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
-                    if not end:
+                    # If we can't find the next <, either we are at the end
+                    # or there's more text incoming.  In the latter case,
+                    # we can't pass the text to handle_data, because a
+                    # charref might be cut in half at the end.  Try to detect
+                    # this before proceeding by looking for an & near the
+                    # end and checking whether it's followed by whitespace or ;.
+                    amppos = rawdata.rfind('&', max(i, n-34))
+                    if (amppos >= 0 and
+                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else: