Mirror of https://github.com/codelucas/newspaper.git (synced 2025-12-23 05:36:50 +00:00)

Commit e5bfad5d42 (parent 90e22d709c): added final test cases, refactored code, modified source, article

5 changed files with 58 additions and 48 deletions
README.rst (53 lines changed)
@@ -19,7 +19,7 @@ Newspaper utilizes async io and caching for speed. *Also, everything is in unicode*
 
 The core 3 methods are:
 
-* ``download()`` retrieves the html, with non blocking io whenever possible.
+* ``download()`` retrieves the html, with multithreading whenever possible.
 * ``parse()`` extracts the body text, authors, titles, etc from the html.
 * ``nlp()`` extracts the summaries, keywords, sentiments from the text.
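Chained together on a single story, the three methods form one pipeline. A minimal sketch using the lower level Article api that this README introduces further down (the import path and the URL are assumptions for illustration):

    from newspaper import Article  # import path assumed

    url = 'http://www.cnn.com/2013/12/07/us/life-pearl-harbor/?iref=obinsite'
    article = Article(url)

    article.download()  # 1. fetch the html
    article.parse()     # 2. extract body text, authors, title from the html
    article.nlp()       # 3. derive the summary and keywords from the parsed text

    print(article.title)
    print(article.keywords)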
@@ -29,7 +29,7 @@ There are two APIs available. Low level ``article`` objects and ``newspaper`` objects
 
 >>> import newspaper
 
->>> cnn_paper = newspaper.build('http://cnn.com')
+>>> cnn_paper = newspaper.build('http://cnn.com') # this takes 10 seconds ish
 
 >>> for article in cnn_paper.articles:
 >>>     print article.url
@@ -38,25 +38,30 @@ There are two APIs available. Low level ``article`` objects and ``newspaper`` objects
 u'http://www.cnn.com/2013/12/07/us/life-pearl-harbor/?iref=obinsite'
 ...
 
->>> print cnn_paper.category_urls
+>>> print cnn_paper.size() # number of articles we extracted and cached
+3100
+
+# category & feed urls extracted once, then cached for a day (adjustable)
+>>> print cnn_paper.category_urls()
 [u'http://lifestyle.cnn.com', u'http://cnn.com/world', u'http://tech.cnn.com' ...]
 
->>> print cnn_paper.feed_urls
+>>> print cnn_paper.feed_urls()
 [u'http://rss.cnn.com/rss/cnn_crime.rss', u'http://rss.cnn.com/rss/cnn_tech.rss', ...]
 
-#### download html for all articles **concurrently**
->>> cnn_paper.download()
+#### build articles, then download, parse, and perform NLP
+>>> for article in cnn_paper.articles[:5]:
+>>>     article.download()
 
 >>> print cnn_paper.articles[0].html
 u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
 
->>> print cnn_paper.articles[5].html
-u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
+>>> print cnn_paper.articles[7].html
+u'' # we only decided to download 5 articles
 
-#### parse html on a per article basis **not concurrent**
->>> cnn_paper.articles[0].parse()
+#### parse an article for its body text, top image, authors, and title
+>>> cnn_paper.articles[0].parse() # just one article this time
 
 >>> print cnn_paper.articles[0].text
 u'Three sisters who were imprisoned for possibly...'
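The removed ``cnn_paper.download()`` call fetched every article's html concurrently; the new example downloads a small slice in a plain loop. To get the concurrent behavior back for many articles, one option is a thread pool over each article's ``download()``; a sketch only (the pool size and slice bounds are arbitrary choices, not newspaper's API):

    from multiprocessing.pool import ThreadPool

    # fetch html for the first 25 articles on 10 worker threads;
    # each worker only calls that article's own download() method
    pool = ThreadPool(processes=10)
    pool.map(lambda a: a.download(), cnn_paper.articles[:25])
    pool.close()
    pool.join()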
@@ -71,7 +76,7 @@ There are two APIs available. Low level ``article`` objects and ``newspaper`` objects
 u'Police: 3 sisters imprisoned in Tucson home'
 
-#### extract nlp on a per article basis **not concurrent**
+#### extract nlp (must be on an already parsed article)
 >>> cnn_paper.articles[0].nlp()
 
 >>> print cnn_paper.articles[0].summary

@@ -80,22 +85,20 @@ There are two APIs available. Low level ``article`` objects and ``newspaper`` objects
 >>> print cnn_paper.articles[0].keywords
 [u'music', u'Tucson', ... ]
 
+# now we try nlp() on an article that has not been parsed
+>>> print cnn_paper.articles[100].nlp()
+Traceback (...
+...
+ArticleException: You must parse an article before you try to nlpify it
 
 #### some other news-source level functionality
 >>> print cnn_paper.brand
 u'cnn'
 
-## Alternatively, parse and nlp all articles together. Will take a while...
-##
-## for article in cnn_paper.articles:
-##     article.parse()
-##     article.nlp()
-##
-## You could even download() articles on a per article basis but
-## that becomes very slow because it won't be concurrent.
-##
-## for article in cnn_paper.articles:
-##     article.download()
 >>> print cnn_paper.description
 u'CNN.com delivers the latest breaking news and information on the latest...'
 
 Alternatively, you may use newspaper's lower level Article api.
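Because ``nlp()`` raises ArticleException on an unparsed article, code that walks a whole paper may want a guard. A small helper one could write on top of the API (not part of newspaper itself; it relies on ``html`` being u'' until a download happens, as shown above):

    def summarize(article):
        # run the earlier pipeline stages lazily so nlp() never
        # sees an article that was not downloaded and parsed
        if not article.html:
            article.download()
        article.parse()
        article.nlp()
        return article.summary

    print(summarize(cnn_paper.articles[100]))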
@@ -117,6 +120,12 @@ Alternatively, you may use newspaper's lower level Article api.
 >>> print article.authors
 [u'Martha Stewart', u'Bob Smith']
 
+>>> print article.top_img
+u'http://some.cdn.com/3424hfd4565sdfgdg436/
+
+>>> print article.title
+u'Thanksgiving Weather Guide Travel ...'
+
 >>> article.nlp()
 
 >>> print article.summary
@@ -118,7 +118,8 @@ def keywords(text):
 
     import operator # sorting
     text = split_words(text)
-    numWords = len(text) # of words before removing blacklist words
+    # of words before removing blacklist words
+    num_words = len(text)
     text = [x for x in text if x not in stopwords]
     freq = Counter()
     for word in text:

@@ -129,7 +130,7 @@ def keywords(text):
     keywords = dict((x,y) for x, y in keywords) # recreate a dict
 
     for k in keywords:
-        articleScore = keywords[k]*1.0 / numWords
+        articleScore = keywords[k]*1.0 / num_words
         keywords[k] = articleScore * 1.5 + 1
 
     keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1))
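The scoring in this hunk is simple enough to test outside the package: a keyword's frequency is taken relative to the total word count (measured before stopword removal), then rescaled so every surviving keyword scores above 1. A self-contained sketch of the same idea, where the tokenizer and stopword list are stand-ins for newspaper's own ``split_words`` and ``stopwords``:

    from collections import Counter
    import operator

    STOPWORDS = set(['the', 'a', 'of', 'in', 'and'])  # stand-in list

    def keywords(text, top_n=10):
        words = text.lower().split()   # stand-in for split_words()
        num_words = len(words)         # counted before stopword removal
        words = [w for w in words if w not in STOPWORDS]
        freq = Counter(words)

        # same formula as the diff: relative frequency * 1.5 + 1
        scores = {}
        for word, count in freq.most_common(top_n):
            article_score = count * 1.0 / num_words
            scores[word] = article_score * 1.5 + 1
        return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    print(keywords('the storm in tucson and the music of tucson'))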
@@ -10,7 +10,7 @@ VERSION = '0.0.1'
 
 PARENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
-POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_urls.txt')
+POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_sources.txt')
 USERAGENTS_FN = os.path.join(PARENT_DIR, 'data/useragents.txt')
 STOPWORDS_EN_FN = os.path.join(PARENT_DIR, 'data/stopwords_en.txt')
 STOPWORDS_EN_FN_2 = os.path.join(PARENT_DIR, 'data/stopwords_en2.txt')
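These constants only build absolute paths into the package's data/ directory; the diff does not show the loader that consumes them. As a sketch only (``load_lines`` is hypothetical, not newspaper code), reading such one-entry-per-line data files could look like:

    def load_lines(filename):
        # one entry per line, e.g. one popular source url per line
        with open(filename) as f:
            return [line.strip() for line in f if line.strip()]

    # popular_sources = load_lines(POP_URLS_FILEN)
    # stopwords = set(load_lines(STOPWORDS_EN_FN))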
@@ -79,11 +79,11 @@ class Source(object):
 
         # Can not merge category and feed tasks together because
         # computing feed urls relies on the category urls!
-        self.set_category_urls()
+        self.set_categories()
         self.download_categories() # mthread
         self.parse_categories()
 
-        self.set_feed_urls()
+        self.set_feeds()
         self.download_feeds() # mthread
         # self.parse_feeds() # TODO regexing out feeds until fix feedparser!
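The comment in this hunk pins down the ordering constraint: feed urls are discovered on the category pages, so all category steps must finish before ``set_feeds()``. A usage sketch of the renamed pipeline driven by hand (the ``from newspaper import Source`` path is an assumption; the call sequence mirrors this hunk and the unit test below):

    from newspaper import Source  # import path assumed

    s = Source('http://cnn.com')
    s.download()             # fetch the homepage html
    s.parse()

    s.set_categories()       # categories first...
    s.download_categories()  # mthread
    s.parse_categories()

    s.set_feeds()            # ...then feeds, which live on category pages
    s.download_feeds()       # mthread

    print(s.category_urls())
    print(s.feed_urls())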
@@ -96,7 +96,7 @@ class Source(object):
 
         # TODO TODO Figure out why using the 'del' command on input list reference
         # isn't actually filtering the list?!
-        #cur_articles = self.articles if in_articles is None else in_articles
+        # cur_articles = self.articles if in_articles is None else in_articles
         new_articles = []
 
         for index, article in enumerate(in_articles):

@@ -127,13 +127,13 @@ class Source(object):
         return parsers.get_category_urls(self)
 
-    def set_category_urls(self):
+    def set_categories(self):
         """"""
         urls = self._get_category_urls(self.domain)
         self.categories = [Category(url=url) for url in urls]
 
-    def set_feed_urls(self):
+    def set_feeds(self):
         """don't need to cache getting feed urls, it's almost
         instant w/ xpath"""

@@ -324,7 +324,7 @@ class Source(object):
         # log.critical('total', len(articles), 'articles and cutoff was at', limit)
 
-    @print_duration
+    # @print_duration
     def download_articles(self, multithread=False):
         """downloads all articles attached to self"""

@@ -378,17 +378,17 @@ class Source(object):
         clear_memo_cache(self)
 
-    def get_feed_urls(self):
+    def feed_urls(self):
         """
         """
         return [feed.url for feed in self.feeds]
 
-    def get_category_urls(self):
+    def category_urls(self):
         """
         """
         return [category.url for category in self.categories]
 
-    def get_article_urls(self):
+    def article_urls(self):
         """
         """
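``clear_memo_cache(self)`` above wipes the cache behind these accessors; per the README, category and feed urls are extracted once and then cached for a day (adjustable). As a rough mental model only, not newspaper's actual implementation, a time-stamped memo decorator would look like:

    import functools
    import time

    def memoize_for(seconds):
        # cache a function's result per-arguments for `seconds`; sketch only
        def decorator(func):
            cache = {}
            @functools.wraps(func)
            def wrapper(*args):
                now = time.time()
                if args in cache:
                    value, stamp = cache[args]
                    if now - stamp < seconds:
                        return value
                value = func(*args)
                cache[args] = (value, now)
                return value
            wrapper.clear = cache.clear  # plays the role of clear_memo_cache
            return wrapper
        return decorator

    # e.g. @memoize_for(24 * 60 * 60) on a category-extraction helper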
@@ -413,6 +413,6 @@ class Source(object):
         print '\t[len of html]:', len(a.html)
         print '\t=============='
 
-        print 'feed_urls:', self.get_feed_urls()
+        print 'feed_urls:', self.feed_urls()
         print '\r\n'
-        print 'category_urls:', self.get_category_urls()
+        print 'category_urls:', self.category_urls()
@@ -167,21 +167,18 @@ class SourceTestCase(unittest.TestCase):
     def test_cache_categories(self):
         """builds two same source objects in a row, examines speeds of both"""
 
-        def wrap_category_urls(source):
-            source.set_category_urls()
-
         s = Source('http://yahoo.com')
         s.download()
         s.parse()
+        s.set_categories()
 
-        wrap_category_urls(s)
-        saved_urls = s.get_category_urls()
-        s.category_urls = [] # reset and try again with caching
-        wrap_category_urls(s)
+        saved_urls = s.category_urls()
+        s.categories = [] # reset and try again with caching
+        s.set_categories()
 
-        assert sorted(s.get_category_urls()) == sorted(saved_urls)
-        # print '[CATEGORIES]', s.get_category_urls()
+        assert sorted(s.category_urls()) == sorted(saved_urls)
+        # print '[CATEGORIES]', s.category_urls()
 
 class UrlTestCase(unittest.TestCase):
     def runTest(self):
@@ -223,6 +220,8 @@ class APITestCase(unittest.TestCase):
         print 'testing API unit'
         self.test_source_build()
         self.test_article_build()
+        self.test_hot_trending()
+        self.test_popular_urls()
 
     @print_test
     def test_source_build(self):

@@ -243,14 +242,15 @@ class APITestCase(unittest.TestCase):
 
     @print_test
     def test_hot_trending(self):
-        """grab google trending"""
+        """grab google trending, just make sure this runs"""
-        print newspaper.hot()
+        newspaper.hot()
 
     @print_test
     def test_popular_urls(self):
         """just make sure this runs"""
-        print newspaper.popular_urls()
+        newspaper.popular_urls()
 
 if __name__ == '__main__':
     # unittest.main() # run all units and their cases
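With ``unittest.main()`` left commented out here, individual cases can still be run by hand through the standard library; for example:

    import unittest

    suite = unittest.TestSuite()
    suite.addTest(SourceTestCase('test_cache_categories'))  # pick one case
    unittest.TextTestRunner(verbosity=2).run(suite)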