Update article html by inserting missing text in order

2025-12-23 05:36:50 +00:00 · 2021-05-14 11:39:09 +12:00 · 2021-05-14 11:39:09 +12:00 · 27caf644cd
commit 27caf644cd
parent bc831c2ab0
2 changed files with 81 additions and 11 deletions
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@ -1036,7 +1036,7 @@ class ContentExtractor(object):
        on like paragraphs and tables
        """
        nodes_to_check = []
-        for tag in ['p', 'pre', 'strong', 'td']:
+        for tag in ['p', 'pre', 'td']:
            items = self.parser.getElementsByTag(doc, tag=tag)
            nodes_to_check += items
        return nodes_to_check
--- a/newspaper/outputformatters.py
+++ b/newspaper/outputformatters.py
@ -7,9 +7,10 @@ __author__ = 'Lucas Ou-Yang'
 __license__ = 'MIT'
 __copyright__ = 'Copyright 2014, Lucas Ou-Yang'

-from html import unescape
 import logging
+import re

+from html import unescape
 from .text import innerTrim


@ -36,6 +37,71 @@ def _update_text_list(txts, to_add, index=None):
        txts.extend(to_add)


+def insert_missing_html(html_idx, text_found, pre_parsed_html, parsed_html, node_text, html_to_update):
+    """A method that updates html by checking if pre_parsed_html (or parsed_html) which represents node_text should be
+    inserted into html_to_update. The method then returns the updated html and a new html-index being the position
+    in html_to_update after the insertion."""
+    # Begin by assuming we need to update the html with the given pre_parsed_html
+    update_html = True
+    # If the pre-parsed html already exists in the html...
+    if pre_parsed_html in html_to_update:
+        # then update the html-index to be its position in the html and flag we do not need to update the html
+        html_idx = html_to_update.index(pre_parsed_html)
+        update_html = False
+    # If the parsed html for this node already exists in the html...
+    elif parsed_html in html_to_update:
+        # then update the html-index to be its position in the html and continue loop to avoid increment
+        html_idx = html_to_update.index(parsed_html) + len(parsed_html)
+        logging.warning('Text in article html \'' + node_text[:10] + '...\' is missing hyperlink(s).')
+        return html_to_update, html_idx
+    # Else if the html is not there but its text was found previously when updating the article text...
+    elif text_found:
+        # attempt to find a variation of the text within the html
+        # Make a copy of the text and flag all spaces in between the words to be '.*' (any characters)
+        text_copy = innerTrim(node_text[:])
+        # Replace any characters that might trigger errors (html tags may be in between word and .; double hyphen can
+        # cause matching issues)
+        text_copy = text_copy.replace('—', ' ').replace('.', '')
+        text_copy_r = text_copy.replace(' ', '.*')
+        # Search for this combination of words (the node's text) in the html (re.S to consider \n)
+        html_match = re.search(text_copy_r, html_to_update, re.S)
+        # If we found a match...
+        if html_match:
+            # Obtain the start of this match
+            match_start = html_match.span()[0]
+            # Search for the first opening tag after the last word in node_text. Not guaranteed to acknowledge
+            # embedded html tags within the found html representing node_text but can ensure the sentence will
+            # not be split (e.g. in case of an anchor tag representing a hyperlink in the middle of a sentence).
+            words = text_copy.split()  # Obtain the words of the node's text
+            # The starting position to search where the last word was found from where the match began (i.e.) not
+            # within the whole string.
+            # Add match_start to get index based off of html_to_update and not in relation to the substring
+            # Return index value as is if we can't find the end of node_text's sentence in existing html
+            try:
+                start_search = html_to_update[match_start:].index(words.pop()) + match_start
+            except ValueError:
+                logging.warning('Could not locate position of text \'' + node_text[:10]
+                                + '...\' in article html; output html may be out of order.')
+                return html_to_update, html_idx
+            open_tag_match = re.search(r'<[^/\n].*>', html_to_update[start_search:], re.S)
+            # If an open tag was found, update html-index to to be its position in html_to_update
+            if open_tag_match:
+                html_idx = open_tag_match.span()[0] + start_search
+            # If no more open tags occur after this node's text in the html, make html-index point to the
+            # end of html_to_update so future nodes are added in order after this node_text
+            else:
+                html_idx = len(html_to_update)
+            # Return index as is because we do not need to update html and do not want html_idx to increment
+            return html_to_update, html_idx
+    # If we are updating the html
+    if update_html:
+        # Tidy up this node's html before inserting before and after the specified html-index
+        pre_parsed_html = innerTrim(pre_parsed_html) + '\n'
+        html_to_update = html_to_update[:html_idx] + pre_parsed_html + html_to_update[html_idx:]
+    # Increment html_idx by the length of characters for this node's html
+    return html_to_update, html_idx + len(pre_parsed_html)
+
+
 class OutputFormatter(object):

    def __init__(self, config, extractor):
@ -98,8 +164,8 @@ class OutputFormatter(object):
    def add_missing_text(self, txts, extra_nodes, html_to_update):
        """A method to return (text, html) given the current text and html so far (txts list and html_to_update).
        The method uses extra_nodes to consider any text that needs to be added before returning final text and html."""
-        # Keep track of the current index we are on
-        current_idx = 0
+        # Keep track of the current index we are on for the text and html
+        current_idx, html_idx = 0, 0
        # For each additional node we have...
        for extra in extra_nodes:
            # Ignore non-text nodes or nodes with a high link density
@ -110,8 +176,13 @@ class OutputFormatter(object):
            txt_count = len(stripped_txts)
            # Check the text is not already within the final txts list
            match = set(stripped_txts).intersection(txts)
-            # If it is already in the txts list, update current_idx to be where the node's text is + 1
-            if len(match):
+            node_found = bool(len(match))
+            # In regards to the html, convert to html and then parse any hyperlinks
+            pre_parsed_html = self.convert_to_html(extra)
+            self.parser.stripTags(extra, 'a')
+            parsed_html = self.convert_to_html(extra)
+            # If the text is already in the txts list, update current_idx to be where the node's text is + 1
+            if node_found:
                # In case of multiple entries for this node's text, gather all indices of the text in txts and
                # find the max (latest) entry
                found_idxs = []
@ -120,14 +191,13 @@ class OutputFormatter(object):
                current_idx = max(found_idxs) + 1
            # If the current node's text has not been added to the final txts list
            else:
-                # Parse any hyperlinks and include in final text
-                self.parser.stripTags(extra, 'a')
                _update_text_list(txts, _prepare_txt(extra.text), index=current_idx)
-                # Given this node is added to the text, add its contents to the html if it should be updated
-                if self.config.keep_article_html:
-                    html_to_update += self.convert_to_html(extra)
                # Update current_idx to be incremented by how many entries were added to txts
                current_idx += txt_count
+            # Update the html if it should be updated
+            if self.config.keep_article_html:
+                html_to_update, html_idx = insert_missing_html(html_idx, node_found, pre_parsed_html,
+                                                               parsed_html, extra.text, html_to_update)
        # Return final string based on txts list
        return '\n\n'.join(txts), html_to_update