added final test cases, refactored code, modified source, article

This commit is contained in:
Lucas Ou-Yang 2013-12-20 11:02:17 -08:00
parent 90e22d709c
commit e5bfad5d42
5 changed files with 58 additions and 48 deletions

View file

@@ -19,7 +19,7 @@ Newspaper utilizes async io and caching for speed. *Also, everything is in unico
The core 3 methods are:
* ``download()`` retrieves the html, with non blocking io whenever possible.
* ``download()`` retrieves the html, with multithreading whenever possible.
* ``parse()`` extracts the body text, authors, titles, etc from the html.
* ``nlp()`` extracts the summaries, keywords, sentiments from the text.
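A minimal sketch of those three calls chained on a single article, mirroring the source-level examples below (the url is just a placeholder):

>>> import newspaper
>>> paper = newspaper.build('http://cnn.com')  # placeholder source
>>> article = paper.articles[0]
>>> article.download()   # 1. fetch the html
>>> article.parse()      # 2. pull out body text, authors, title
>>> article.nlp()        # 3. summary & keywords from the parsed text
>>> print article.summary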
@@ -29,7 +29,7 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
>>> import newspaper
>>> cnn_paper = newspaper.build('http://cnn.com')
>>> cnn_paper = newspaper.build('http://cnn.com') # this takes 10 seconds ish
>>> for article in cnn_paper.articles:
>>>     print article.url
@@ -38,25 +38,30 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
u'http://www.cnn.com/2013/12/07/us/life-pearl-harbor/?iref=obinsite'
...
>>> print cnn_paper.category_urls
>>> print cnn_paper.size() # number of articles we extracted and cached
3100
# category & feed urls extracted once, then cached for a day (adjustable)
>>> print cnn_paper.category_urls()
[u'http://lifestyle.cnn.com', u'http://cnn.com/world', u'http://tech.cnn.com' ...]
>>> print cnn_paper.feed_urls
>>> print cnn_paper.feed_urls()
[u'http://rss.cnn.com/rss/cnn_crime.rss', u'http://rss.cnn.com/rss/cnn_tech.rss', ...]
#### download html for all articles **concurrently**
>>> cnn_paper.download()
#### build articles, then download, parse, and perform NLP
>>> for article in cnn_paper.articles[:5]:
>>>     article.download()
>>> print cnn_paper.articles[0].html
u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
>>> print cnn_paper.articles[4].html
u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
>>> print cnn_paper.articles[7].html
u'' # we only decided to download 5 articles
#### parse html on a per article basis **not concurrent**
>>> cnn_paper.articles[0].parse()
### parse an article for its body text, top image, authors, and title
>>> cnn_paper.articles[0].parse() # just one article this time
>>> print cnn_paper.articles[0].text
u'Three sisters who were imprisoned for possibly...'
@@ -71,7 +76,7 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
u'Police: 3 sisters imprisoned in Tucson home'
#### extract nlp on a per article basis **not concurrent**
#### extract nlp (must be run on an already parsed article)
>>> cnn_paper.articles[0].nlp()
>>> print cnn_paper.articles[0].summary
@@ -80,22 +85,20 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
>>> print cnn_paper.articles[0].keywords
[u'music', u'Tucson', ... ]
# now we try nlp() on an article that has not been downloaded
>>> print cnn_paper.articles[100].nlp()
Traceback (...
...
ArticleException: You must parse an article before you try to nlpify it
#### some other news-source level functionality
>>> print cnn_paper.brand
u'cnn'
## Alternatively, parse and nlp all articles together. Will take a while...
##
## for article in cnn_paper.articles:
## article.parse()
## article.nlp()
##
## You could even download() articles on a per-article basis, but
## that becomes very slow because it won't be concurrent.
##
## for article in cnn_paper.articles:
## article.download()
>>> print cnn_paper.description
u'CNN.com delivers the latest breaking news and information on the latest...'
Alternatively, you may use newspaper's lower level Article api.
@@ -117,6 +120,12 @@ Alternatively, you may use newspaper's lower level Article api.
>>> print article.authors
[u'Martha Stewart', u'Bob Smith']
>>> print article.top_img
u'http://some.cdn.com/3424hfd4565sdfgdg436/'
>>> print article.title
u'Thanksgiving Weather Guide Travel ...'
>>> article.nlp()
>>> print article.summary
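The hunk above shows only the tail of the lower level example; a complete pass looks roughly like the sketch below. The import and constructor call are assumptions here, the rest follows the calls shown in this diff:

>>> from newspaper import Article
>>> article = Article('http://www.cnn.com/some-article-url')  # placeholder url
>>> article.download()
>>> article.parse()
>>> print article.text
>>> article.nlp()
>>> print article.keywords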

View file

@@ -118,7 +118,8 @@ def keywords(text):
import operator # sorting
text = split_words(text)
numWords = len(text) # of words before removing blacklist words
# number of words before removing blacklist words
num_words = len(text)
text = [x for x in text if x not in stopwords]
freq = Counter()
for word in text:
@@ -129,7 +130,7 @@ def keywords(text):
keywords = dict((x,y) for x, y in keywords) # recreate a dict
for k in keywords:
articleScore = keywords[k]*1.0 / numWords
articleScore = keywords[k]*1.0 / num_words
keywords[k] = articleScore * 1.5 + 1
keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1))
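A self-contained sketch of the scoring logic in this hunk; the whitespace split and the tiny stopword set are stand-ins for the module's split_words() and real stopword list:

from collections import Counter

STOPWORDS = set(['the', 'a', 'of', 'and', 'to', 'on'])  # toy stand-in for the real list

def keywords_sketch(text, top_n=10):
    """Score words by frequency, normalized by document length."""
    words = text.lower().split()            # stand-in for split_words()
    num_words = max(len(words), 1)          # count before stopword removal
    words = [w for w in words if w not in STOPWORDS]
    freq = Counter(words)
    scores = {}
    for word, count in freq.most_common(top_n):
        # frequency relative to document length, boosted and shifted as above
        scores[word] = (count * 1.0 / num_words) * 1.5 + 1
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

print keywords_sketch('the cat sat on the mat and the cat slept')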

View file

@@ -10,7 +10,7 @@ VERSION = '0.0.1'
PARENT_DIR = os.path.dirname(os.path.abspath(__file__))
POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_urls.txt')
POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_sources.txt')
USERAGENTS_FN = os.path.join(PARENT_DIR, 'data/useragents.txt')
STOPWORDS_EN_FN = os.path.join(PARENT_DIR, 'data/stopwords_en.txt')
STOPWORDS_EN_FN_2 = os.path.join(PARENT_DIR, 'data/stopwords_en2.txt')
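A sketch of how a path constant like STOPWORDS_EN_FN might be consumed; the loader itself is illustrative, not this module's code:

import codecs

def load_stopwords(filename):
    """Read one stopword per line into a set, skipping blanks."""
    with codecs.open(filename, 'r', 'utf-8') as f:
        return set(line.strip() for line in f if line.strip())

# stopwords = load_stopwords(STOPWORDS_EN_FN)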

View file

@@ -79,11 +79,11 @@ class Source(object):
# Can not merge category and feed tasks together because
# computing feed urls relies on the category urls!
self.set_category_urls()
self.set_categories()
self.download_categories() # mthread
self.parse_categories()
self.set_feed_urls()
self.set_feeds()
self.download_feeds() # mthread
# self.parse_feeds() # TODO regexing out feeds until fix feedparser!
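The ordering above exists because feed urls are usually discovered inside the downloaded category pages. A rough illustration of that dependency, not the library's actual feed discovery (lxml is an assumption here):

import lxml.html

def find_feed_urls(category_html):
    """Pull RSS/Atom <link> hrefs out of one category page's html."""
    doc = lxml.html.fromstring(category_html)
    links = doc.xpath('//link[@type="application/rss+xml" or @type="application/atom+xml"]')
    return [l.get('href') for l in links if l.get('href')]

# feeds can only be collected after category pages have been downloaded:
# feed_urls = [u for cat in categories for u in find_feed_urls(cat.html)]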
@@ -96,7 +96,7 @@ class Source(object):
# TODO TODO Figure out why using the 'del' command on input list reference
# isn't actually filtering the list?!
#cur_articles = self.articles if in_articles is None else in_articles
# cur_articles = self.articles if in_articles is None else in_articles
new_articles = []
for index, article in enumerate(in_articles):
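One common cause for the TODO above: deleting from a list while iterating over it shifts the remaining elements, so some are never checked, which is why the code rebuilds new_articles instead. A small illustration of the pitfall:

items = [1, 2, 2, 3]
for i, x in enumerate(items):
    if x == 2:
        del items[i]         # shifts the rest left; the second 2 is skipped
print items                   # [1, 2, 3], not [1, 3]

kept = [x for x in [1, 2, 2, 3] if x != 2]   # the rebuild pattern used above
print kept                    # [1, 3]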
@@ -127,13 +127,13 @@ class Source(object):
return parsers.get_category_urls(self)
def set_category_urls(self):
def set_categories(self):
""""""
urls = self._get_category_urls(self.domain)
self.categories = [Category(url=url) for url in urls]
def set_feed_urls(self):
def set_feeds(self):
"""don't need to cache getting feed urls, it's almost
instant w/ xpath"""
@@ -324,7 +324,7 @@ class Source(object):
# log.critical('total', len(articles), 'articles and cutoff was at', limit)
@print_duration
# @print_duration
def download_articles(self, multithread=False):
"""downloads all articles attached to self"""
@@ -378,17 +378,17 @@ class Source(object):
clear_memo_cache(self)
def get_feed_urls(self):
def feed_urls(self):
"""
"""
return [feed.url for feed in self.feeds]
def get_category_urls(self):
def category_urls(self):
"""
"""
return [category.url for category in self.categories]
def get_article_urls(self):
def article_urls(self):
"""
"""
@@ -413,6 +413,6 @@ class Source(object):
print '\t[len of html]:', len(a.html)
print '\t=============='
print 'feed_urls:', self.get_feed_urls()
print 'feed_urls:', self.feed_urls()
print '\r\n'
print 'category_urls:', self.get_category_urls()
print 'category_urls:', self.category_urls()

View file

@@ -167,21 +167,18 @@ class SourceTestCase(unittest.TestCase):
def test_cache_categories(self):
"""builds two same source objects in a row examines speeds of both"""
def wrap_category_urls(source):
source.set_category_urls()
s = Source('http://yahoo.com')
s.download()
s.parse()
s.set_categories()
wrap_category_urls(s)
saved_urls = s.get_category_urls()
saved_urls = s.category_urls()
s.categories = [] # reset and try again with caching
s.category_urls = [] # reset and try again with caching
wrap_category_urls(s)
s.set_categories()
assert sorted(s.get_category_urls()) == sorted(saved_urls)
# print '[CATEGORIES]', s.get_category_urls()
assert sorted(s.category_urls()) == sorted(saved_urls)
# print '[CATEGORIES]', s.category_urls()
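The caching this test exercises ("extracted once, then cached for a day") can be pictured as a time-stamped memo; the decorator below only sketches the idea behind the memoize/clear_memo_cache helpers, it is not their actual code:

import time

def memoize_for(seconds):
    """Cache a function's result per argument tuple for a limited time."""
    def decorator(func):
        cache = {}
        def wrapper(*args):
            hit = cache.get(args)
            if hit is not None and time.time() - hit[0] < seconds:
                return hit[1]
            value = func(*args)
            cache[args] = (time.time(), value)
            return value
        wrapper._cache = cache
        return wrapper
    return decorator

def clear_memo(func):
    """Drop everything a memoized function has cached."""
    func._cache.clear()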
class UrlTestCase(unittest.TestCase):
def runTest(self):
@@ -223,6 +220,8 @@ class APITestCase(unittest.TestCase):
print 'testing API unit'
self.test_source_build()
self.test_article_build()
self.test_hot_trending()
self.test_popular_urls()
@print_test
def test_source_build(self):
@@ -243,14 +242,15 @@ class APITestCase(unittest.TestCase):
@print_test
def test_hot_trending(self):
"""grab google trending"""
"""grab google trending, just make sure this runs"""
print newspaper.hot()
newspaper.hot()
@print_test
def test_popular_urls(self):
"""just make sure this runs"""
print newspaper.popular_urls()
newspaper.popular_urls()
if __name__ == '__main__':
# unittest.main() # run all units and their cases