added final test cases, refactored code, modified source, article

This commit is contained in:
Lucas Ou-Yang 2013-12-20 11:02:17 -08:00
parent 90e22d709c
commit e5bfad5d42
5 changed files with 58 additions and 48 deletions

View file

@@ -19,7 +19,7 @@ Newspaper utilizes async io and caching for speed. *Also, everything is in unico
The core 3 methods are:
* ``download()`` retrieves the html, with non blocking io whenever possible.
* ``download()`` retrieves the html, with multithreading whenever possible.
* ``parse()`` extracts the body text, authors, titles, etc from the html.
* ``nlp()`` extracts the summaries, keywords, sentiments from the text.
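A minimal sketch of those three calls chained on a single article, mirroring the source-level examples below (the url is just a placeholder):

>>> import newspaper
>>> paper = newspaper.build('http://cnn.com')  # placeholder source
>>> article = paper.articles[0]
>>> article.download()   # 1. fetch the html
>>> article.parse()      # 2. pull out body text, authors, title
>>> article.nlp()        # 3. summary & keywords from the parsed text
>>> print article.summary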
@@ -29,7 +29,7 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
>>> import newspaper
>>> cnn_paper = newspaper.build('http://cnn.com')
>>> cnn_paper = newspaper.build('http://cnn.com') # this takes 10 seconds ish
>>> for article in cnn_paper.articles:
>>>     print article.url
@@ -38,25 +38,30 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
u'http://www.cnn.com/2013/12/07/us/life-pearl-harbor/?iref=obinsite'
...
>>> print cnn_paper.category_urls
>>> print cnn_paper.size() # number of articles we extracted and cached
3100
# category & feed urls extracted once, then cached for a day (adjustable)
>>> print cnn_paper.category_urls()
[u'http://lifestyle.cnn.com', u'http://cnn.com/world', u'http://tech.cnn.com' ...]
>>> print cnn_paper.feed_urls
>>> print cnn_paper.feed_urls()
[u'http://rss.cnn.com/rss/cnn_crime.rss', u'http://rss.cnn.com/rss/cnn_tech.rss', ...]
#### download html for all articles **concurrently**
>>> cnn_paper.download()
#### build articles, then download, parse, and perform NLP
>>> for article in cnn_paper.articles[:5]:
>>>     article.download()
>>> print cnn_paper.articles[0].html
u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
>>> print cnn_paper.articles[4].html
u'<!DOCTYPE HTML><html itemscope itemtype="http://...'
>>> print cnn_paper.articles[7].html
u'' # we only decided to download 5 articles
#### parse html on a per article basis **not concurrent**
>>> cnn_paper.articles[0].parse()
### parse an article for its body text, top image, authors, and title
>>> cnn_paper.articles[0].parse() # just one article this time
>>> print cnn_paper.articles[0].text
u'Three sisters who were imprisoned for possibly...'
@@ -71,7 +76,7 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
u'Police: 3 sisters imprisoned in Tucson home'
#### extract nlp on a per article basis **not concurrent**
#### extract nlp (must be run on an already parsed article)
>>> cnn_paper.articles[0].nlp()
>>> print cnn_paper.articles[0].summary
@@ -80,22 +85,20 @@ There are two API's available. Low level ``article`` objects and ``newspaper`` o
>>> print cnn_paper.articles[0].keywords
[u'music', u'Tucson', ... ]
# now we try nlp() on an article that has not been downloaded
>>> print cnn_paper.articles[100].nlp()
Traceback (...
...
ArticleException: You must parse an article before you try to nlpify it
#### some other news-source level functionality
>>> print cnn_paper.brand
u'cnn'
## Alternatively, parse and nlp all articles together. Will take a while...
##
## for article in cnn_paper.articles:
## article.parse()
## article.nlp()
##
## You could even download() articles on a per-article basis, but
## that becomes very slow because it won't be concurrent.
##
## for article in cnn_paper.articles:
## article.download()
>>> print cnn_paper.description
u'CNN.com delivers the latest breaking news and information on the latest...'
Alternatively, you may use newspaper's lower level Article api.
@@ -117,6 +120,12 @@ Alternatively, you may use newspaper's lower level Article api.
>>> print article.authors
[u'Martha Stewart', u'Bob Smith']
>>> print article.top_img
u'http://some.cdn.com/3424hfd4565sdfgdg436/'
>>> print article.title
u'Thanksgiving Weather Guide Travel ...'
>>> article.nlp()
>>> print article.summary
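The hunk above shows only the tail of the lower level example; a complete pass looks roughly like the sketch below. The import and constructor call are assumptions here, the rest follows the calls shown in this diff:

>>> from newspaper import Article
>>> article = Article('http://www.cnn.com/some-article-url')  # placeholder url
>>> article.download()
>>> article.parse()
>>> print article.text
>>> article.nlp()
>>> print article.keywords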

View file

@@ -118,7 +118,8 @@ def keywords(text):
import operator # sorting
text = split_words(text)
numWords = len(text) # of words before removing blacklist words
# number of words before removing blacklist words
num_words = len(text)
text = [x for x in text if x not in stopwords]
freq = Counter()
for word in text:
@@ -129,7 +130,7 @@ def keywords(text):
keywords = dict((x,y) for x, y in keywords) # recreate a dict
for k in keywords:
articleScore = keywords[k]*1.0 / numWords
articleScore = keywords[k]*1.0 / num_words
keywords[k] = articleScore * 1.5 + 1
keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1))
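A self-contained sketch of the scoring logic in this hunk; the whitespace split and the tiny stopword set are stand-ins for the module's split_words() and real stopword list:

from collections import Counter

STOPWORDS = set(['the', 'a', 'of', 'and', 'to', 'on'])  # toy stand-in for the real list

def keywords_sketch(text, top_n=10):
    """Score words by frequency, normalized by document length."""
    words = text.lower().split()            # stand-in for split_words()
    num_words = max(len(words), 1)          # count before stopword removal
    words = [w for w in words if w not in STOPWORDS]
    freq = Counter(words)
    scores = {}
    for word, count in freq.most_common(top_n):
        # frequency relative to document length, boosted and shifted as above
        scores[word] = (count * 1.0 / num_words) * 1.5 + 1
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

print keywords_sketch('the cat sat on the mat and the cat slept')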

View file

@@ -10,7 +10,7 @@ VERSION = '0.0.1'
PARENT_DIR = os.path.dirname(os.path.abspath(__file__))
POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_urls.txt')
POP_URLS_FILEN = os.path.join(PARENT_DIR, 'data/popular_sources.txt')
USERAGENTS_FN = os.path.join(PARENT_DIR, 'data/useragents.txt')
STOPWORDS_EN_FN = os.path.join(PARENT_DIR, 'data/stopwords_en.txt')
STOPWORDS_EN_FN_2 = os.path.join(PARENT_DIR, 'data/stopwords_en2.txt')
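A sketch of how a path constant like STOPWORDS_EN_FN might be consumed; the loader itself is illustrative, not this module's code:

import codecs

def load_stopwords(filename):
    """Read one stopword per line into a set, skipping blanks."""
    with codecs.open(filename, 'r', 'utf-8') as f:
        return set(line.strip() for line in f if line.strip())

# stopwords = load_stopwords(STOPWORDS_EN_FN)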

View file

@@ -79,11 +79,11 @@ class Source(object):
# Can not merge category and feed tasks together because
# computing feed urls relies on the category urls!
self.set_category_urls()
self.set_categories()
self.download_categories() # mthread
self.parse_categories()
self.set_feed_urls()
self.set_feeds()
self.download_feeds() # mthread
# self.parse_feeds() # TODO regexing out feeds until fix feedparser!
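The ordering above exists because feed urls are usually discovered inside the downloaded category pages. A rough illustration of that dependency, not the library's actual feed discovery (lxml is an assumption here):

import lxml.html

def find_feed_urls(category_html):
    """Pull RSS/Atom <link> hrefs out of one category page's html."""
    doc = lxml.html.fromstring(category_html)
    links = doc.xpath('//link[@type="application/rss+xml" or @type="application/atom+xml"]')
    return [l.get('href') for l in links if l.get('href')]

# feeds can only be collected after category pages have been downloaded:
# feed_urls = [u for cat in categories for u in find_feed_urls(cat.html)]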
@@ -96,7 +96,7 @@ class Source(object):
# TODO TODO Figure out why using the 'del' command on input list reference
# isn't actually filtering the list?!
#cur_articles = self.articles if in_articles is None else in_articles
# cur_articles = self.articles if in_articles is None else in_articles
new_articles = []
for index, article in enumerate(in_articles):
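One common cause for the TODO above: deleting from a list while iterating over it shifts the remaining elements, so some are never checked, which is why the code rebuilds new_articles instead. A small illustration of the pitfall:

items = [1, 2, 2, 3]
for i, x in enumerate(items):
    if x == 2:
        del items[i]         # shifts the rest left; the second 2 is skipped
print items                   # [1, 2, 3], not [1, 3]

kept = [x for x in [1, 2, 2, 3] if x != 2]   # the rebuild pattern used above
print kept                    # [1, 3]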
@@ -127,13 +127,13 @@ class Source(object):
return parsers.get_category_urls(self)
def set_category_urls(self):
def set_categories(self):
""""""
urls = self._get_category_urls(self.domain)
self.categories = [Category(url=url) for url in urls]
def set_feed_urls(self):
def set_feeds(self):
"""don't need to cache getting feed urls, it's almost
instant w/ xpath"""
@@ -324,7 +324,7 @@ class Source(object):
# log.critical('total', len(articles), 'articles and cutoff was at', limit)
@print_duration
# @print_duration
def download_articles(self, multithread=False):
"""downloads all articles attached to self"""
@@ -378,17 +378,17 @@ class Source(object):
clear_memo_cache(self)
def get_feed_urls(self):
def feed_urls(self):
"""
"""
return [feed.url for feed in self.feeds]
def get_category_urls(self):
def category_urls(self):
"""
"""
return [category.url for category in self.categories]
def get_article_urls(self):
def article_urls(self):
"""
"""
@@ -413,6 +413,6 @@ class Source(object):
print '\t[len of html]:', len(a.html)
print '\t=============='
print 'feed_urls:', self.get_feed_urls()
print 'feed_urls:', self.feed_urls()
print '\r\n'
print 'category_urls:', self.get_category_urls()
print 'category_urls:', self.category_urls()

View file

@@ -167,21 +167,18 @@ class SourceTestCase(unittest.TestCase):
def test_cache_categories(self):
"""builds two same source objects in a row examines speeds of both"""
def wrap_category_urls(source):
source.set_category_urls()
s = Source('http://yahoo.com')
s.download()
s.parse()
s.set_categories()
wrap_category_urls(s)
saved_urls = s.get_category_urls()
saved_urls = s.category_urls()
s.categories = [] # reset and try again with caching
s.category_urls = [] # reset and try again with caching
wrap_category_urls(s)
s.set_categories()
assert sorted(s.get_category_urls()) == sorted(saved_urls)
# print '[CATEGORIES]', s.get_category_urls()
assert sorted(s.category_urls()) == sorted(saved_urls)
# print '[CATEGORIES]', s.category_urls()
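The caching this test exercises ("extracted once, then cached for a day") can be pictured as a time-stamped memo; the decorator below only sketches the idea behind the memoize/clear_memo_cache helpers, it is not their actual code:

import time

def memoize_for(seconds):
    """Cache a function's result per argument tuple for a limited time."""
    def decorator(func):
        cache = {}
        def wrapper(*args):
            hit = cache.get(args)
            if hit is not None and time.time() - hit[0] < seconds:
                return hit[1]
            value = func(*args)
            cache[args] = (time.time(), value)
            return value
        wrapper._cache = cache
        return wrapper
    return decorator

def clear_memo(func):
    """Drop everything a memoized function has cached."""
    func._cache.clear()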
class UrlTestCase(unittest.TestCase):
def runTest(self):
@@ -223,6 +220,8 @@ class APITestCase(unittest.TestCase):
print 'testing API unit'
self.test_source_build()
self.test_article_build()
self.test_hot_trending()
self.test_popular_urls()
@print_test
def test_source_build(self):
@@ -243,14 +242,15 @@ class APITestCase(unittest.TestCase):
@print_test
def test_hot_trending(self):
"""grab google trending"""
"""grab google trending, just make sure this runs"""
print newspaper.hot()
newspaper.hot()
@print_test
def test_popular_urls(self):
"""just make sure this runs"""
print newspaper.popular_urls()
newspaper.popular_urls()
if __name__ == '__main__':
# unittest.main() # run all units and their cases