def insert_missing_html(html_idx, text_found, pre_parsed_html, parsed_html, node_text, html_to_update):
    """Insert a node's html into the article html unless it is already represented there.

    Nothing is inserted when ``pre_parsed_html`` or ``parsed_html`` already occurs in
    ``html_to_update``, or when ``text_found`` is set (the node's text is already part
    of the article text). In every other case ``pre_parsed_html`` is tidied and spliced
    in at ``html_idx``.

    Args:
        html_idx: Position in ``html_to_update`` where missing html would be inserted.
        text_found: Whether the node's text was already present in the article text.
        pre_parsed_html: The node's html, used for the exact-match check and for insertion.
        parsed_html: Alternative html for the same node; a match on it is logged as
            "missing hyperlink(s)".
        node_text: The node's plain text. ``None`` is treated as an empty string.
        html_to_update: The article html built so far.

    Returns:
        A ``(html, html_idx)`` tuple: the (possibly updated) html and the position at
        which the next missing node should be inserted.
    """
    # Short excerpt identifying the node in log messages (tolerates a None text)
    snippet = (node_text or '')[:10]

    # The node's html is already present verbatim: leave the html alone and step past it
    exact_pos = html_to_update.find(pre_parsed_html)
    if exact_pos != -1:
        return html_to_update, exact_pos + len(pre_parsed_html)

    # Only the alternative (parsed) form is present: step past it without inserting
    parsed_pos = html_to_update.find(parsed_html)
    if parsed_pos != -1:
        logging.warning("Text in article html '%s...' is missing hyperlink(s).", snippet)
        return html_to_update, parsed_pos + len(parsed_html)

    if text_found:
        # NOTE(review): assumes a node whose text was already found is never re-inserted,
        # even when its position cannot be located in the html - confirm this is intended.
        # Html tags may sit between a word and its full stop, and the dash can cause
        # matching issues, so both are dropped before matching.
        cleaned = innerTrim(node_text or '').replace('—', ' ').replace('.', '')
        words = cleaned.split()
        html_match = None
        if words:
            # Words are escaped so text such as "(c++)" is matched literally; the lazy gaps
            # (re.S lets them cross newlines) allow markup between consecutive words while
            # keeping the match end on the nearest in-order occurrence of the last word.
            pattern = '.*?'.join(re.escape(word) for word in words)
            html_match = re.search(pattern, html_to_update, re.S)
        if html_match is None:
            logging.warning("Could not locate position of text '%s...' in article html; "
                            "output html may be out of order.", snippet)
            return html_to_update, html_idx
        # Continue from the first opening tag after the node's text so that a sentence
        # containing embedded tags (e.g. a hyperlink) is not split by a later insertion
        text_end = html_match.end()
        open_tag_match = re.search(r'<[^/\n].*>', html_to_update[text_end:], re.S)
        if open_tag_match:
            html_idx = text_end + open_tag_match.start()
        else:
            # No further opening tag: later nodes are appended after everything else
            html_idx = len(html_to_update)
        return html_to_update, html_idx

    # The node is missing from the html: tidy it and splice it in at html_idx
    tidy_html = innerTrim(pre_parsed_html) + '\n'
    html_to_update = html_to_update[:html_idx] + tidy_html + html_to_update[html_idx:]
    # Advance the index past the inserted html so following nodes stay in order
    return html_to_update, html_idx + len(tidy_html)
The method uses extra_nodes to consider any text that needs to be added before returning final text and html.""" - # Keep track of the current index we are on - current_idx = 0 + # Keep track of the current index we are on for the text and html + current_idx, html_idx = 0, 0 # For each additional node we have... for extra in extra_nodes: # Ignore non-text nodes or nodes with a high link density @@ -110,8 +176,13 @@ class OutputFormatter(object): txt_count = len(stripped_txts) # Check the text is not already within the final txts list match = set(stripped_txts).intersection(txts) - # If it is already in the txts list, update current_idx to be where the node's text is + 1 - if len(match): + node_found = bool(len(match)) + # In regards to the html, convert to html and then parse any hyperlinks + pre_parsed_html = self.convert_to_html(extra) + self.parser.stripTags(extra, 'a') + parsed_html = self.convert_to_html(extra) + # If the text is already in the txts list, update current_idx to be where the node's text is + 1 + if node_found: # In case of multiple entries for this node's text, gather all indices of the text in txts and # find the max (latest) entry found_idxs = [] @@ -120,14 +191,13 @@ class OutputFormatter(object): current_idx = max(found_idxs) + 1 # If the current node's text has not been added to the final txts list else: - # Parse any hyperlinks and include in final text - self.parser.stripTags(extra, 'a') _update_text_list(txts, _prepare_txt(extra.text), index=current_idx) - # Given this node is added to the text, add its contents to the html if it should be updated - if self.config.keep_article_html: - html_to_update += self.convert_to_html(extra) # Update current_idx to be incremented by how many entries were added to txts current_idx += txt_count + # Update the html if it should be updated + if self.config.keep_article_html: + html_to_update, html_idx = insert_missing_html(html_idx, node_found, pre_parsed_html, + parsed_html, extra.text, html_to_update) 
# Return final string based on txts list return '\n\n'.join(txts), html_to_update