diff --git a/README.rst b/README.rst
index 1b6a388..a6b3d5d 100644
--- a/README.rst
+++ b/README.rst
@@ -19,7 +19,7 @@ Newspaper utilizes async io and caching for speed. *Also, everything is in unico
 
 The core 3 methods are:
 
-* ``download()`` retrieves the html, with non blocking io whenever possible.
+* ``download()`` retrieves the html, with multithreading whenever possible.
 * ``parse()`` extracts the body text, authors, titles, etc from the html.
 * ``nlp()`` extracts the summaries, keywords, sentiments from the text.
 
@@ -29,7 +29,7 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
 
     >>> import newspaper
 
-    >>> cnn_paper = newspaper.build('http://cnn.com')
+    >>> cnn_paper = newspaper.build('http://cnn.com') # this takes about 10 seconds
 
     >>> for article in cnn_paper.articles:
     >>>     print article.url
 
@@ -38,25 +38,30 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
     u'http://www.cnn.com/2013/12/07/us/life-pearl-harbor/?iref=obinsite'
     ...
 
-    >>> print cnn_paper.category_urls
+    >>> print cnn_paper.size() # number of articles we extracted and cached
+    3100
+
+    # category & feed urls extracted once, then cached for a day (adjustable)
+    >>> print cnn_paper.category_urls()
     [u'http://lifestyle.cnn.com', u'http://cnn.com/world', u'http://tech.cnn.com' ...]
 
-    >>> print cnn_paper.feed_urls
+    >>> print cnn_paper.feed_urls()
     [u'http://rss.cnn.com/rss/cnn_crime.rss', u'http://rss.cnn.com/rss/cnn_tech.rss', ...]
 
-    #### download html for all articles **concurrently**
-    >>> cnn_paper.download()
+    #### build articles, then download, parse, and perform NLP
+    >>> for article in cnn_paper.articles[:5]:
+            article.download()
 
     >>> print cnn_paper.articles[0].html
     u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
 
+    >>> print cnn_paper.articles[7].html
+    u'' # we only decided to download 5 articles
 
-    #### parse html on a per article basis **not concurrent**
-    >>> cnn_paper.articles[0].parse()
+    #### parse an article for its body text, top image, authors, and title
+    >>> cnn_paper.articles[0].parse() # just one article this time
 
     >>> print cnn_paper.articles[0].text
     u'Three sisters who were imprisoned for possibly...'
 
@@ -71,7 +76,7 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
 
     u'Police: 3 sisters imprisoned in Tucson home'
 
-    #### extract nlp on a per article basis **not concurrent**
+    #### extract nlp (must be run on an already parsed article)
     >>> cnn_paper.articles[0].nlp()
 
     >>> print cnn_paper.articles[0].summary
 
@@ -80,22 +85,20 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
 
     >>> print cnn_paper.articles[0].keywords
     [u'music', u'Tucson', ... ]
 
+    # now we try nlp() on an article that has not been downloaded
+    >>> print cnn_paper.articles[100].nlp()
+    Traceback (...
+    ...
+    ArticleException: You must parse an article before you try to nlpify it
+
     #### some other news-source level functionality
     >>> print cnn_paper.brand
     u'cnn'
 
-    ## Alternatively, parse and nlp all articles together. Will take a while...
-    ##
-    ## for article in cnn_paper.articles:
-    ##     article.parse()
-    ##     article.nlp()
-    ##
-    ## You could even download() articles on a per article basis but
-    ## that becomes very slow because it wont be concurrent.
-    ##
-    ## for article in cnn_paper.articles:
-    ##     article.download()
+    >>> print cnn_paper.description
+    u'CNN.com delivers the latest breaking news and information on the latest...'
+
 
 Alternatively, you may use newspaper's lower level Article api.
@@ -117,6 +120,12 @@ Alternatively, you may use newspaper's lower level Article api.
 
     >>> print article.authors
     [u'Martha Stewart', u'Bob Smith']
 
+    >>> print article.top_img
+    u'http://some.cdn.com/3424hfd4565sdfgdg436/'
+
+    >>> print article.title
+    u'Thanksgiving Weather Guide Travel ...'
+
     >>> article.nlp()
 
     >>> print article.summary
diff --git a/newspaper/nlp.py b/newspaper/nlp.py
index ae3f907..160bc90 100644
--- a/newspaper/nlp.py
+++ b/newspaper/nlp.py
@@ -118,7 +118,8 @@ def keywords(text):
     import operator # sorting
     text = split_words(text)
-    numWords = len(text) # of words before removing blacklist words
+    # number of words before removing blacklist words
+    num_words = len(text)
     text = [x for x in text if x not in stopwords]
     freq = Counter()
     for word in text:
@@ -129,7 +130,7 @@ def keywords(text):
     keywords = dict((x,y) for x, y in keywords) # recreate a dict
 
     for k in keywords:
-        articleScore = keywords[k]*1.0 / numWords
+        articleScore = keywords[k]*1.0 / num_words
         keywords[k] = articleScore * 1.5 + 1
 
     keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1))
diff --git a/newspaper/settings.py b/newspaper/settings.py
index d62df9b..9ba3b14 100644
--- a/newspaper/settings.py
+++ b/newspaper/settings.py
@@ -10,7 +10,7 @@ VERSION = '0.0.1'
 
 PARENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
-POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_urls.txt')
+POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_sources.txt')
 USERAGENTS_FN = os.path.join(PARENT_DIR, 'data/useragents.txt')
 STOPWORDS_EN_FN = os.path.join(PARENT_DIR, 'data/stopwords_en.txt')
 STOPWORDS_EN_FN_2 = os.path.join(PARENT_DIR, 'data/stopwords_en2.txt')
diff --git a/newspaper/source.py b/newspaper/source.py
index df41036..71d4913 100644
--- a/newspaper/source.py
+++ b/newspaper/source.py
@@ -79,11 +79,11 @@ class Source(object):
 
         # Can not merge category and feed tasks together because
         # computing feed urls relies on the category urls!
-        self.set_category_urls()
+        self.set_categories()
         self.download_categories() # mthread
         self.parse_categories()
 
-        self.set_feed_urls()
+        self.set_feeds()
         self.download_feeds() # mthread
         # self.parse_feeds() # TODO regexing out feeds until fix feedparser!
 
@@ -96,7 +96,7 @@ class Source(object):
 
         # TODO TODO Figure out why using the 'del' command on input list reference
         # isn't actually filtering the list?!
-        #cur_articles = self.articles if in_articles is None else in_articles
+        # cur_articles = self.articles if in_articles is None else in_articles
         new_articles = []
 
         for index, article in enumerate(in_articles):
@@ -127,13 +127,13 @@ class Source(object):
 
         return parsers.get_category_urls(self)
 
-    def set_category_urls(self):
+    def set_categories(self):
         """"""
         urls = self._get_category_urls(self.domain)
         self.categories = [Category(url=url) for url in urls]
 
-    def set_feed_urls(self):
+    def set_feeds(self):
         """don't need to cache getting feed urls,
         it's almost instant w/ xpath"""
 
@@ -324,7 +324,7 @@ class Source(object):
 
         # log.critical('total', len(articles), 'articles and cutoff was at', limit)
 
-    @print_duration
+    # @print_duration
     def download_articles(self, multithread=False):
         """downloads all articles attached to self"""
 
@@ -378,17 +378,17 @@ class Source(object):
 
         clear_memo_cache(self)
 
-    def get_feed_urls(self):
+    def feed_urls(self):
         """
         """
         return [feed.url for feed in self.feeds]
 
-    def get_category_urls(self):
+    def category_urls(self):
         """
         """
         return [category.url for category in self.categories]
 
-    def get_article_urls(self):
+    def article_urls(self):
         """
         """
 
@@ -413,6 +413,6 @@ class Source(object):
             print '\t[len of html]:', len(a.html)
             print '\t=============='
 
-        print 'feed_urls:', self.get_feed_urls()
+        print 'feed_urls:', self.feed_urls()
         print '\r\n'
-        print 'category_urls:', self.get_category_urls()
+        print 'category_urls:', self.category_urls()
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 8bf8dd3..d07d3b0 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -167,21 +167,18 @@ class SourceTestCase(unittest.TestCase):
     def test_cache_categories(self):
         """builds two same source objects in a row examines speeds of both"""
 
-        def wrap_category_urls(source):
-            source.set_category_urls()
-
         s = Source('http://yahoo.com')
         s.download()
         s.parse()
+        s.set_categories()
 
-        wrap_category_urls(s)
-        saved_urls = s.get_category_urls()
+        saved_urls = s.category_urls()
+        s.categories = [] # reset and try again with caching
 
-        s.category_urls = [] # reset and try again with caching
-        wrap_category_urls(s)
+        s.set_categories()
 
-        assert sorted(s.get_category_urls()) == sorted(saved_urls)
-        # print '[CATEGORIES]', s.get_category_urls()
+        assert sorted(s.category_urls()) == sorted(saved_urls)
+        # print '[CATEGORIES]', s.category_urls()
 
 class UrlTestCase(unittest.TestCase):
     def runTest(self):
@@ -223,6 +220,8 @@ class APITestCase(unittest.TestCase):
         print 'testing API unit'
         self.test_source_build()
         self.test_article_build()
+        self.test_hot_trending()
+        self.test_popular_urls()
 
     @print_test
     def test_source_build(self):
@@ -243,14 +242,15 @@ class APITestCase(unittest.TestCase):
 
     @print_test
     def test_hot_trending(self):
-        """grab google trending"""
+        """grab google trending, just make sure this runs"""
 
-        print newspaper.hot()
+        newspaper.hot()
 
     @print_test
     def test_popular_urls(self):
+        """just make sure this runs"""
 
-        print newspaper.popular_urls()
+        newspaper.popular_urls()
 
 if __name__ == '__main__':
     # unittest.main() # run all units and their cases
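
Taken together, the README hunks above describe a per-article workflow: build a source, download and parse a handful of articles, then run nlp() only on what has been parsed. Here is a minimal sketch of that flow under the renamed methods from this diff; the ArticleException import path is an assumption (only the exception name appears in the README traceback), and the printed values depend on the live site::

    import newspaper
    from newspaper.article import ArticleException  # import path is an assumption

    cnn_paper = newspaper.build('http://cnn.com')   # categories, feeds, article urls
    print(cnn_paper.size())                         # number of article urls extracted
    print(cnn_paper.category_urls())                # renamed from get_category_urls()
    print(cnn_paper.feed_urls())                    # renamed from get_feed_urls()

    for article in cnn_paper.articles[:5]:
        article.download()                          # fetch html for these five only
        article.parse()                             # text, title, authors, top_img
        article.nlp()                               # summary and keywords need parse()
        print(article.title)

    try:
        cnn_paper.articles[100].nlp()               # never downloaded or parsed
    except ArticleException as e:
        print(e)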
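The nlp.py hunks only rename ``numWords`` to ``num_words``, but the scoring they sit inside is easy to lose in two-line slices. Below is a self-contained sketch of that step under stated assumptions: ``split_words`` and the stopword list are stubbed in here (the real ones are loaded elsewhere in the package), and the top-10 cutoff mirrors what the surrounding code appears to do; only the scaling line matches nlp.keywords() exactly::

    import operator
    from collections import Counter

    STOPWORDS = {'the', 'a', 'an', 'of', 'and', 'to', 'in', 'who', 'were', 'for'}

    def split_words(text):
        # stand-in tokenizer; newspaper ships its own splitter and stopword file
        return [w.lower().strip('.,!?') for w in text.split()]

    def keywords(text):
        words = split_words(text)
        num_words = len(words)               # count *before* stopword removal
        words = [w for w in words if w not in STOPWORDS]

        freq = Counter(words)
        top = dict(freq.most_common(10))     # keep the ten most frequent terms

        for k in top:
            article_score = top[k] * 1.0 / num_words
            top[k] = article_score * 1.5 + 1  # same scaling as in nlp.keywords()

        return sorted(top.items(), key=operator.itemgetter(1), reverse=True)

    print(keywords('Three sisters who were imprisoned in a Tucson home for years'))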
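Finally, the reworked ``test_cache_categories`` is the quickest way to sanity-check the renames against a live source. Roughly the same round trip can be run standalone as below; it assumes ``Source`` is importable from the top-level package and it needs network access, so treat it as a sketch rather than a fixture::

    from newspaper import Source

    s = Source('http://yahoo.com')
    s.download()
    s.parse()

    s.set_categories()                      # was set_category_urls()
    saved_urls = s.category_urls()          # was get_category_urls()

    s.categories = []                       # reset, then rebuild from the memo cache
    s.set_categories()

    assert sorted(s.category_urls()) == sorted(saved_urls)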