From dd839cedfd9aac1163ce49c6ae275aaa6ab7c27b Mon Sep 17 00:00:00 2001 From: Adam Nelson Date: Thu, 7 Jul 2016 14:11:05 -0400 Subject: [PATCH 1/2] Reimplement parse_feeds() to grab titles of feeds --- newspaper/configuration.py | 4 ++-- newspaper/extractors.py | 4 ++-- newspaper/source.py | 26 +++++++++++--------------- requirements.txt | 1 - 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/newspaper/configuration.py b/newspaper/configuration.py index f93a532..f49c4ce 100644 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -25,7 +25,7 @@ class Configuration(object): def __init__(self): """ Modify any of these Article / Source properties - TODO: Have a seperate ArticleConfig and SourceConfig extend this! + TODO: Have a separate ArticleConfig and SourceConfig extend this! """ self.MIN_WORD_COUNT = 300 # num of word tokens in text self.MIN_SENT_COUNT = 7 # num of sentence tokens @@ -55,7 +55,7 @@ class Configuration(object): # You may keep the html of just the main article body self.keep_article_html = False - # Fail for error respones (e.g. 404 page) + # Fail for error responses (e.g. 404 page) self.http_success_only = True # English is the fallback diff --git a/newspaper/extractors.py b/newspaper/extractors.py index 0762905..f2cb14d 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -229,7 +229,7 @@ class ContentExtractor(object): - title tag is the most reliable (inherited from Goose) - h1, if properly detected, is the best (visible to users) - og:title and h1 can help improve the title extraction - - python == is too strict, often we need to compare fitlered + - python == is too strict, often we need to compare filtered versions, i.e. lowercase and ignoring special chars Explicit rules: @@ -251,7 +251,7 @@ class ContentExtractor(object): # title from h1 # - extract the longest text from all h1 elements - # - too short texts (less than 2 words) are discarded + # - too short texts (fewer than 2 words) are discarded # - clean double spaces title_text_h1 = '' title_element_h1_list = self.parser.getElementsByTag(doc, tag='h1') or [] diff --git a/newspaper/source.py b/newspaper/source.py index adac3e6..ef0cbf5 100644 --- a/newspaper/source.py +++ b/newspaper/source.py @@ -98,7 +98,7 @@ class Source(object): self.set_feeds() self.download_feeds() # mthread - # TODO: self.parse_feeds() # regex for now + # self.parse_feeds() self.generate_articles() @@ -204,21 +204,17 @@ class Source(object): self.categories = [c for c in self.categories if c.doc is not None] - def parse_feeds(self): - """DEPRECATED - Due to the slow speed of feedparser, we won't be dom parsing - our .rss feeds, but rather regex searching for urls in the .rss - text and then relying on our article logic to detect false urls. - """ - for feed in self.feeds: - try: - feed.dom = feedparser.parse(feed.html) - except Exception as e: - log.critical('feedparser failed %s' % e) - if self.config.verbose: - print('feed %s has failed parsing' % feed.url) + def _map_title_to_feed(self,feed): + doc = self.config.get_parser().fromstring(feed.rss) + feed.title = self.config.get_parser().getElementsByTag(doc, tag='title')[0].text or '' + return feed - self.feeds = [feed for feed in self.feeds if feed.dom is not None] + def parse_feeds(self): + """Add titles to feeds + """ + log.debug('We are parsing %d feeds' % + len(self.feeds)) + self.feeds = [self._map_title_to_feed(f) for f in self.feeds] def feeds_to_articles(self): """Returns articles given the url of a feed diff --git a/requirements.txt b/requirements.txt index 588f5b5..b974150 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ cssselect>=0.9.1 lxml>=3.3.5 nltk>=3.2 requests>=2.3.0 -feedparser>=5.1.3 tldextract>=1.5.1 feedfinder2>=0.0.4 jieba3k>=0.35.1 From 07871f8ce706844802edc19a3e5cbbfb291994bd Mon Sep 17 00:00:00 2001 From: Adam Nelson Date: Fri, 8 Jul 2016 15:10:04 -0400 Subject: [PATCH 2/2] source.parse_feeds() now adds titles to source.feeds --- .travis.yml | 1 - newspaper/__init__.py | 5 +- newspaper/api.py | 5 +- newspaper/configuration.py | 19 ++++--- newspaper/extractors.py | 102 +++++++++++++++++----------------- newspaper/outputformatters.py | 4 +- newspaper/parsers.py | 4 +- newspaper/source.py | 36 +++--------- 8 files changed, 78 insertions(+), 98 deletions(-) diff --git a/.travis.yml b/.travis.yml index 28d9525..21e6e47 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "3.3" - "3.4" - "3.5" install: diff --git a/newspaper/__init__.py b/newspaper/__init__.py index eaef885..3ce5aa4 100644 --- a/newspaper/__init__.py +++ b/newspaper/__init__.py @@ -7,9 +7,10 @@ __author__ = 'Lucas Ou-Yang' __license__ = 'MIT' __copyright__ = 'Copyright 2014, Lucas Ou-Yang' -from .article import Article, ArticleException from .api import (build, build_article, fulltext, hot, languages, - popular_urls, NewsPool, Configuration as Config) + popular_urls, Configuration as Config) +from .article import Article, ArticleException +from .mthreading import NewsPool from .source import Source from .version import __version__ diff --git a/newspaper/api.py b/newspaper/api.py index f69338f..fb98e81 100644 --- a/newspaper/api.py +++ b/newspaper/api.py @@ -13,13 +13,12 @@ import feedparser from .article import Article from .configuration import Configuration -from .mthreading import NewsPool from .settings import POPULAR_URLS, TRENDING_URL from .source import Source from .utils import extend_config, print_available_languages -def build(url='', dry=False, config=None, **kwargs): +def build(url='', dry=False, config=None, **kwargs) -> Source: """Returns a constructed source object without downloading or parsing the articles """ @@ -32,7 +31,7 @@ def build(url='', dry=False, config=None, **kwargs): return s -def build_article(url='', config=None, **kwargs): +def build_article(url='', config=None, **kwargs) -> Article: """Returns a constructed article object without downloading or parsing """ diff --git a/newspaper/configuration.py b/newspaper/configuration.py index f49c4ce..5a7b5ec 100644 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -21,19 +21,18 @@ log = logging.getLogger(__name__) class Configuration(object): - def __init__(self): """ Modify any of these Article / Source properties TODO: Have a separate ArticleConfig and SourceConfig extend this! """ self.MIN_WORD_COUNT = 300 # num of word tokens in text - self.MIN_SENT_COUNT = 7 # num of sentence tokens - self.MAX_TITLE = 200 # num of chars - self.MAX_TEXT = 100000 # num of chars - self.MAX_KEYWORDS = 35 # num of strings in list - self.MAX_AUTHORS = 10 # num strings in list - self.MAX_SUMMARY = 5000 # num of chars + self.MIN_SENT_COUNT = 7 # num of sentence tokens + self.MAX_TITLE = 200 # num of chars + self.MAX_TEXT = 100000 # num of chars + self.MAX_KEYWORDS = 35 # num of strings in list + self.MAX_AUTHORS = 10 # num strings in list + self.MAX_SUMMARY = 5000 # num of chars self.MAX_SUMMARY_SENT = 5 # num of sentences # max number of urls we cache for each news source @@ -101,7 +100,8 @@ class Configuration(object): language = property(get_language, set_language, del_language, "language prop") - def get_stopwords_class(self, language): + @staticmethod + def get_stopwords_class(language): if language == 'ko': return StopWordsKorean elif language == 'zh': @@ -110,7 +110,8 @@ class Configuration(object): return StopWordsArabic return StopWords - def get_parser(self): + @staticmethod + def get_parser(): return Parser diff --git a/newspaper/extractors.py b/newspaper/extractors.py index f2cb14d..0a763b9 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -11,17 +11,16 @@ __author__ = 'Lucas Ou-Yang' __license__ = 'MIT' __copyright__ = 'Copyright 2014, Lucas Ou-Yang' -from collections import defaultdict -import copy -from dateutil.parser import parse as date_parser import logging -import re import urllib.parse +from collections import defaultdict +import copy +import re +from dateutil.parser import parse as date_parser from tldextract import tldextract from . import urls - from .utils import StringReplacement, StringSplitter log = logging.getLogger(__name__) @@ -53,7 +52,6 @@ bad_domains = ['amazon', 'doubleclick', 'twitter'] class ContentExtractor(object): - def __init__(self, config): self.config = config self.parser = self.config.get_parser() @@ -61,10 +59,10 @@ class ContentExtractor(object): self.stopwords_class = config.stopwords_class def update_language(self, meta_lang): - '''Required to be called before the extraction process in some + """Required to be called before the extraction process in some cases because the stopwords_class has to set incase the lang is not latin based - ''' + """ if meta_lang: self.language = meta_lang self.stopwords_class = \ @@ -93,10 +91,10 @@ class ContentExtractor(object): return result def parse_byline(search_str): - """Takes a candidate line of html or text and - extracts out the name(s) in list form - >>> search_str('
By: Lucas Ou-Yang, \ - Alex Smith
') + """ + Takes a candidate line of html or text and + extracts out the name(s) in list form: + >>> parse_byline('
By: Lucas Ou-Yang,Alex Smith
') ['Lucas Ou-Yang', 'Alex Smith'] """ # Remove HTML boilerplate @@ -116,10 +114,10 @@ class ContentExtractor(object): _authors = [] # List of first, last name tokens curname = [] - DELIM = ['and', ',', ''] + delimiters = ['and', ',', ''] for token in name_tokens: - if token in DELIM: + if token in delimiters: if len(curname) > 0: _authors.append(' '.join(curname)) curname = [] @@ -184,7 +182,7 @@ class ContentExtractor(object): try: datetime_obj = date_parser(date_str) return datetime_obj - except: + except (ValueError, OverflowError): # near all parse failures are due to URL dates without a day # specifier, e.g. /2014/04/ return None @@ -341,8 +339,8 @@ class ContentExtractor(object): hint = filter_regex.sub('', hint).lower() # find the largest title piece - for i in range(len(title_pieces)): - current = title_pieces[i].strip() + for i, title_piece in enumerate(title_pieces): + current = title_piece.strip() if hint and hint in filter_regex.sub('', current).lower(): large_text_index = i break @@ -393,7 +391,7 @@ class ContentExtractor(object): # look up for a Content-Language in meta items = [ {'tag': 'meta', 'attr': 'http-equiv', - 'value': 'content-language'}, + 'value': 'content-language'}, {'tag': 'meta', 'attr': 'name', 'value': 'lang'} ] for item in items: @@ -409,14 +407,14 @@ class ContentExtractor(object): return None - def get_meta_content(self, doc, metaName): + def get_meta_content(self, doc, metaname): """Extract a given meta content form document. Example metaNames: "meta[name=description]" "meta[name=keywords]" "meta[property=og:type]" """ - meta = self.parser.css_select(doc, metaName) + meta = self.parser.css_select(doc, metaname) content = None if meta is not None and len(meta) > 0: content = self.parser.getAttribute(meta[0], 'content') @@ -540,7 +538,7 @@ class ContentExtractor(object): urls = [img_tag.get('src') for img_tag in img_tags if img_tag.get('src')] img_links = set([urllib.parse.urljoin(article_url, url) - for url in urls]) + for url in urls]) return img_links def get_first_img_url(self, article_url, top_node): @@ -747,7 +745,7 @@ class ContentExtractor(object): for node in nodes_to_check: text_node = self.parser.getText(node) - word_stats = self.stopwords_class(language=self.language).\ + word_stats = self.stopwords_class(language=self.language). \ get_stopword_count(text_node) high_link_density = self.is_highlink_density(node) if word_stats.get_stopword_count() > 2 and not high_link_density: @@ -760,7 +758,7 @@ class ContentExtractor(object): for node in nodes_with_text: boost_score = float(0) # boost - if(self.is_boostable(node)): + if self.is_boostable(node): if cnt >= 0: boost_score = float((1.0 / starting_boost) * 50) starting_boost += 1 @@ -775,7 +773,7 @@ class ContentExtractor(object): boost_score = float(5) text_node = self.parser.getText(node) - word_stats = self.stopwords_class(language=self.language).\ + word_stats = self.stopwords_class(language=self.language). \ get_stopword_count(text_node) upscore = int(word_stats.get_stopword_count() + boost_score) @@ -827,9 +825,9 @@ class ContentExtractor(object): if current_node_tag == para: if steps_away >= max_stepsaway_from_node: return False - paraText = self.parser.getText(current_node) - word_stats = self.stopwords_class(language=self.language).\ - get_stopword_count(paraText) + paragraph_text = self.parser.getText(current_node) + word_stats = self.stopwords_class(language=self.language). \ + get_stopword_count(paragraph_text) if word_stats.get_stopword_count() > minimum_stopword_count: return True steps_away += 1 @@ -844,21 +842,21 @@ class ContentExtractor(object): return b def add_siblings(self, top_node): - baselinescore_siblings_para = self.get_siblings_score(top_node) + baseline_score_siblings_para = self.get_siblings_score(top_node) results = self.walk_siblings(top_node) for current_node in results: ps = self.get_siblings_content( - current_node, baselinescore_siblings_para) + current_node, baseline_score_siblings_para) for p in ps: top_node.insert(0, p) return top_node def get_siblings_content( - self, current_sibling, baselinescore_siblings_para): + self, current_sibling, baseline_score_siblings_para): """Adds any siblings that may have a decent score to this node """ if current_sibling.tag == 'p' and \ - len(self.parser.getText(current_sibling)) > 0: + len(self.parser.getText(current_sibling)) > 0: e0 = current_sibling if e0.tail: e0 = copy.deepcopy(e0) @@ -874,13 +872,13 @@ class ContentExtractor(object): for first_paragraph in potential_paragraphs: text = self.parser.getText(first_paragraph) if len(text) > 0: - word_stats = self.stopwords_class(language=self.language).\ + word_stats = self.stopwords_class(language=self.language). \ get_stopword_count(text) paragraph_score = word_stats.get_stopword_count() sibling_baseline_score = float(.30) high_link_density = self.is_highlink_density( first_paragraph) - score = float(baselinescore_siblings_para * + score = float(baseline_score_siblings_para * sibling_baseline_score) if score < paragraph_score and not high_link_density: p = self.parser.createElement( @@ -904,7 +902,7 @@ class ContentExtractor(object): for node in nodes_to_check: text_node = self.parser.getText(node) - word_stats = self.stopwords_class(language=self.language).\ + word_stats = self.stopwords_class(language=self.language). \ get_stopword_count(text_node) high_link_density = self.is_highlink_density(node) if word_stats.get_stopword_count() > 2 and not high_link_density: @@ -916,7 +914,7 @@ class ContentExtractor(object): return base - def update_score(self, node, addToScore): + def update_score(self, node, add_to_score): """Adds a score to the gravityScore Attribute we put on divs we'll get the current score then add the score we're passing in to the current. @@ -926,7 +924,7 @@ class ContentExtractor(object): if score_string: current_score = float(score_string) - new_score = current_score + addToScore + new_score = current_score + add_to_score self.parser.setAttribute(node, "gravityScore", str(new_score)) def update_node_count(self, node, add_to_count): @@ -957,12 +955,12 @@ class ContentExtractor(object): for link in links: sb.append(self.parser.getText(link)) - linkText = ''.join(sb) - linkWords = linkText.split() - numberOfLinkWords = float(len(linkWords)) - numberOfLinks = float(len(links)) - linkDivisor = float(numberOfLinkWords / words_number) - score = float(linkDivisor * numberOfLinks) + link_text = ''.join(sb) + link_words = link_text.split() + num_link_words = float(len(link_words)) + num_links = float(len(links)) + link_divisor = float(num_link_words / words_number) + score = float(link_divisor * num_links) if score >= 1.0: return True return False @@ -974,10 +972,10 @@ class ContentExtractor(object): return self.get_node_gravity_score(node) or 0 def get_node_gravity_score(self, node): - grvScoreString = self.parser.getAttribute(node, 'gravityScore') - if not grvScoreString: + gravity_score = self.parser.getAttribute(node, 'gravityScore') + if not gravity_score: return None - return float(grvScoreString) + return float(gravity_score) def nodes_to_check(self, doc): """Returns a list of nodes we want to search @@ -990,23 +988,23 @@ class ContentExtractor(object): return nodes_to_check def is_table_and_no_para_exist(self, e): - subParagraphs = self.parser.getElementsByTag(e, tag='p') - for p in subParagraphs: + sub_paragraphs = self.parser.getElementsByTag(e, tag='p') + for p in sub_paragraphs: txt = self.parser.getText(p) if len(txt) < 25: self.parser.remove(p) - subParagraphs2 = self.parser.getElementsByTag(e, tag='p') - if len(subParagraphs2) == 0 and e.tag != "td": + sub_paragraphs_2 = self.parser.getElementsByTag(e, tag='p') + if len(sub_paragraphs_2) == 0 and e.tag != "td": return True return False def is_nodescore_threshold_met(self, node, e): top_node_score = self.get_score(node) - current_nodeScore = self.get_score(e) - thresholdScore = float(top_node_score * .08) + current_node_score = self.get_score(e) + threshold = float(top_node_score * .08) - if (current_nodeScore < thresholdScore) and e.tag != 'td': + if (current_node_score < threshold) and e.tag != 'td': return False return True diff --git a/newspaper/outputformatters.py b/newspaper/outputformatters.py index 02b8d22..7e7e7e8 100644 --- a/newspaper/outputformatters.py +++ b/newspaper/outputformatters.py @@ -7,7 +7,7 @@ __author__ = 'Lucas Ou-Yang' __license__ = 'MIT' __copyright__ = 'Copyright 2014, Lucas Ou-Yang' -from html.parser import HTMLParser +from html import unescape import logging from .text import innerTrim @@ -70,7 +70,7 @@ class OutputFormatter(object): txt = None if txt: - txt = HTMLParser().unescape(txt) + txt = unescape(txt) txt_lis = innerTrim(txt).split(r'\n') txt_lis = [n.strip(' ') for n in txt_lis] txts.extend(txt_lis) diff --git a/newspaper/parsers.py b/newspaper/parsers.py index f4c1847..3483fb7 100644 --- a/newspaper/parsers.py +++ b/newspaper/parsers.py @@ -12,7 +12,7 @@ import lxml.html import lxml.html.clean import re import traceback -from html.parser import HTMLParser +from html import unescape from bs4 import UnicodeDammit from copy import deepcopy @@ -247,7 +247,7 @@ class Parser(object): if attr: attr = node.attrib.get(attr, None) if attr: - attr = HTMLParser().unescape(attr) + attr = unescape(attr) return attr @classmethod diff --git a/newspaper/source.py b/newspaper/source.py index ef0cbf5..545fe48 100644 --- a/newspaper/source.py +++ b/newspaper/source.py @@ -10,25 +10,20 @@ __copyright__ = 'Copyright 2014, Lucas Ou-Yang' import logging -import feedparser - from tldextract import tldextract from . import network from . import urls from . import utils - from .article import Article -from .extractors import ContentExtractor from .configuration import Configuration +from .extractors import ContentExtractor from .settings import ANCHOR_DIRECTORY - log = logging.getLogger(__name__) class Category(object): - def __init__(self, url): self.url = url self.html = None @@ -36,7 +31,6 @@ class Category(object): class Feed(object): - def __init__(self, url): self.url = url self.rss = None @@ -52,6 +46,7 @@ class Source(object): articles = [
,
, ..] brand = 'cnn' """ + def __init__(self, url, config=None, **kwargs): """The config object for this source will be passed into all of this source's children articles unless specified otherwise or re-set. @@ -85,7 +80,7 @@ class Source(object): self.is_parsed = False self.is_downloaded = False - def build(self, response=None): + def build(self): """Encapsulates download and basic parsing with lxml. May be a good idea to split this into download() and parse() methods. """ @@ -97,7 +92,7 @@ class Source(object): self.parse_categories() self.set_feeds() - self.download_feeds() # mthread + self.download_feeds() # mthread # self.parse_feeds() self.generate_articles() @@ -118,7 +113,7 @@ class Source(object): return articles @utils.cache_disk(seconds=(86400 * 1), cache_folder=ANCHOR_DIRECTORY) - def _get_category_urls(self, domain): + def _get_category_urls(self): """The domain param is **necessary**, see .utils.cache_disk for reasons. the boilerplate method is so we can use this decorator right. We are caching categories for 1 day. @@ -204,9 +199,10 @@ class Source(object): self.categories = [c for c in self.categories if c.doc is not None] - def _map_title_to_feed(self,feed): + def _map_title_to_feed(self, feed): doc = self.config.get_parser().fromstring(feed.rss) - feed.title = self.config.get_parser().getElementsByTag(doc, tag='title')[0].text or '' + feed.title = self.config.get_parser().getElementsByTag(doc, tag='title')[ + 0].text or self.brand return feed def parse_feeds(self): @@ -243,7 +239,7 @@ class Source(object): if self.config.verbose: print(('%d->%d->%d for %s' % - (before_purge, after_purge, after_memo, feed.url))) + (before_purge, after_purge, after_memo, feed.url))) log.debug('%d->%d->%d for %s' % (before_purge, after_purge, after_memo, feed.url)) return articles @@ -374,20 +370,6 @@ class Source(object): """ return [article.url for article in self.articles] - def get_key(self): - # TODO - pass - - def clear_anchor_directory(self): - """Clears out all files in our directory where we cache anchors - the key is sha1(self.domain).hexdigest() fn is ANCHOR_DIR/key. - """ - pass - # TODO: - # d_pth = os.path.join( - # settings.MEMO_DIR, domain_to_filename(source_domain)) - # os.path.remove(ANCHOR_DIRECTORY) - def print_summary(self): """Prints out a summary of the data in our source instance """