Tests passing (almost)

The library appears to be updated and working. Two tests, `test_nlp_body`
and `test_parse_html`, still fail because their results are
non-deterministic: the assertions on the summary, keywords, and authors
compare against fixed expected values.
Paul English 2014-11-11 20:11:37 -07:00
parent 1579c10002
commit 32df727166
11 changed files with 46 additions and 42 deletions

View file

@@ -170,7 +170,7 @@ class DocumentCleaner(object):
                     and self.parser.getTag(next_node) == "a" \
                     and self.parser.getAttribute(
                         next_node, 'grv-usedalready') != 'yes':
-                outer = " " + self.parser.outerHtml(next_node) + " "
+                outer = " " + self.parser.outerHtml(next_node).decode('utf-8') + " "
                 replacement_text.append(outer)
                 nodes_to_remove.append(next_node)
                 self.parser.setAttribute(next_node, attr='grv-usedalready',
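For context, a minimal sketch (not from this commit) of why the decode is needed: under Python 3, lxml's serializer returns bytes, and concatenating bytes with str raises TypeError.

```python
import lxml.html

node = lxml.html.fragment_fromstring('<a href="#">link</a>')
raw = lxml.html.tostring(node)   # bytes under Python 3, e.g. b'<a href="#">link</a>'
# " " + raw + " " would raise TypeError: can only concatenate str (not "bytes") to str
outer = " " + raw.decode('utf-8') + " "
print(outer)
```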

View file

@@ -418,7 +418,7 @@ class ContentExtractor(object):
             return []
         # If we are extracting from raw text
         if regex:
-            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html)
+            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html.decode('utf-8'))
             doc_or_html = re.findall(
                 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                 '(?:%[0-9a-fA-F][0-9a-fA-F]))+', doc_or_html)
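A standalone sketch of what this branch does, assuming a bytes input (the sample string is invented for illustration): decode, replace tags with spaces, then pull URLs from the remaining text.

```python
import re

raw = b'<p>Source: http://example.com/story (via feed)</p>'
# A str pattern cannot be applied to bytes in Python 3, hence the decode
text = re.sub('<[^<]+?>', ' ', raw.decode('utf-8'))
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                  r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
print(urls)  # ['http://example.com/story']
```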
@@ -762,7 +762,7 @@ class ContentExtractor(object):
         current_score = 0
         score_string = self.parser.getAttribute(node, 'gravityScore')
         if score_string:
-            current_score = int(score_string)
+            current_score = float(score_string)
 
         new_score = current_score + addToScore
         self.parser.setAttribute(node, "gravityScore", str(new_score))
@@ -813,7 +813,7 @@ class ContentExtractor(object):
         grvScoreString = self.parser.getAttribute(node, 'gravityScore')
         if not grvScoreString:
             return None
-        return int(grvScoreString)
+        return float(grvScoreString)
 
     def nodes_to_check(self, doc):
         """Returns a list of nodes we want to search

View file

@@ -78,7 +78,7 @@ def clean_url(url):
     """Url quotes unicode data out of urls
     """
     url = url.encode('utf8')
-    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url])
+    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url.decode('utf-8')])
     return url
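A simplified sketch of the quoting logic, using the hypothetical helper name quote_unicode (the real clean_url also round-trips through encode/decode): iterating a Python 3 str yields one-character strings, so ord() and quote() work per character.

```python
import urllib.parse

def quote_unicode(url):
    # Percent-quote only non-ASCII characters; leave the rest untouched
    return ''.join(urllib.parse.quote(c) if ord(c) >= 127 else c for c in url)

print(quote_unicode('http://example.com/caf\u00e9'))  # http://example.com/caf%C3%A9
```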
@@ -136,7 +136,7 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
                 print('we caught a favicon!: %s' % url)
             else:
                 # import traceback
-                # print traceback.format_exc()
+                # print(traceback.format_exc())
                 print('PIL feed() failure for image:', url, str(e))
                 raise e
         p = None

View file

@@ -112,23 +112,26 @@ def keywords(text):
     """
     text = split_words(text)
     # of words before removing blacklist words
-    num_words = len(text)
-    text = [x for x in text if x not in stopwords]
-    freq = Counter()
-    for word in text:
-        freq[word] += 1
+    if text:
+        num_words = len(text)
+        text = [x for x in text if x not in stopwords]
+        freq = Counter()
+        for word in text:
+            freq[word] += 1
 
-    min_size = min(10, len(freq))
-    keywords = tuple(freq.most_common(min_size))
-    keywords = dict((x, y) for x, y in keywords)
+        min_size = min(10, len(freq))
+        keywords = tuple(freq.most_common(min_size))
+        keywords = dict((x, y) for x, y in keywords)
 
-    for k in keywords:
-        articleScore = keywords[k]*1.0 / max(num_words, 1)
-        keywords[k] = articleScore * 1.5 + 1
+        for k in keywords:
+            articleScore = keywords[k]*1.0 / max(num_words, 1)
+            keywords[k] = articleScore * 1.5 + 1
 
-    keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
-    keywords.reverse()
-    return dict(keywords)
+        keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
+        keywords.reverse()
+        return dict(keywords)
+    else:
+        return dict()
 
 
 def split_sentences(text):
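For readability, the guarded branch as a hypothetical standalone function keyword_scores (stopwords is assumed to be a set; it is a module-level name in the diff):

```python
from collections import Counter

def keyword_scores(words, stopwords=frozenset()):
    # Mirrors the guarded branch above: empty input short-circuits to {}
    if not words:
        return {}
    num_words = len(words)                        # counted before filtering
    words = [w for w in words if w not in stopwords]
    freq = Counter(words)
    top = dict(freq.most_common(min(10, len(freq))))
    # Normalize by total word count, then scale: score * 1.5 + 1
    return {w: (c / max(num_words, 1)) * 1.5 + 1 for w, c in top.items()}

print(keyword_scores('the quick fox and the lazy fox'.split(), {'the', 'and'}))
```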
@@ -137,7 +140,7 @@ def split_sentences(text):
     import nltk.data
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-    sentences = tokenizer.tokenize(text)
+    sentences = tokenizer.tokenize(text.decode('utf-8'))
     sentences = [x.replace('\n', '') for x in sentences if len(x) > 10]
     return sentences
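A minimal usage sketch of the tokenizer call (now fed str instead of bytes), assuming the NLTK punkt data can be downloaded:

```python
import nltk

nltk.download('punkt', quiet=True)  # fetch tokenizer data on first run
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = "Hi. This sentence is long enough to keep.\nAnd so is this second one."
sentences = [s.replace('\n', '') for s in tokenizer.tokenize(text) if len(s) > 10]
print(sentences)  # the short "Hi." is filtered out
```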
@@ -147,12 +150,15 @@ def length_score(sentence_len):
 
 
 def title_score(title, sentence):
-    title = [x for x in title if x not in stopwords]
-    count = 0.0
-    for word in sentence:
-        if (word not in stopwords and word in title):
-            count += 1.0
-    return count / max(len(title), 1)
+    if title:
+        title = [x for x in title if x not in stopwords]
+        count = 0.0
+        for word in sentence:
+            if (word not in stopwords and word in title):
+                count += 1.0
+        return count / max(len(title), 1)
+    else:
+        return 0
 
 
 def sentence_position(i, size):
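The new guard matters when title is None, which would otherwise raise TypeError on iteration (an empty list already scored 0). A hypothetical standalone version with stopwords as a parameter:

```python
def title_score(title, sentence, stopwords=frozenset()):
    # Guarded version: a None title no longer raises when iterated
    if title:
        title = [w for w in title if w not in stopwords]
        count = sum(1.0 for w in sentence
                    if w not in stopwords and w in title)
        return count / max(len(title), 1)
    return 0

print(title_score(['fox', 'news'], ['the', 'fox', 'ran'], {'the'}))  # 0.5
print(title_score(None, ['anything']))                               # 0
```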

View file

@@ -326,8 +326,8 @@ class Source(object):
             self.articles = [a for a in self.articles if a.html]
         else:
             if threads > 5:
-                print ('Using 5+ threads on a single source '
-                       'may get you rate limited!')
+                print(('Using 5+ threads on a single source '
+                       'may get you rate limited!'))
             filled_requests = network.multithread_request(urls, self.config)
             # Note that the responses are returned in original order
             for index, req in enumerate(filled_requests):

View file

@@ -76,11 +76,9 @@ class StopWords(object):
         content_is_unicode = isinstance(content, str)
         if content_is_unicode:
             content = content.encode('utf-8')
-        stripped_input = content.translate(
-            self.TRANS_TABLE, string.punctuation)
+        trans_table = {ord(c): None for c in string.punctuation}
+        stripped_input = content.decode('utf-8').translate(trans_table)
 
-        if content_is_unicode:
-            return stripped_input.decode('utf-8')
         return stripped_input
 
     def candidate_words(self, stripped_input):
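This rewrite is forced by Python 3, where str.translate() takes a single mapping of code points instead of the old two-argument (table, deletechars) form; mapping to None deletes the character:

```python
import string

# Python 3: str.translate() takes a {codepoint: replacement} mapping
trans_table = {ord(c): None for c in string.punctuation}
print('Hello, world! (test)'.translate(trans_table))  # Hello world test
```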

View file

@@ -88,7 +88,7 @@ def prepare_url(url, source_url=None):
         proper_url = remove_args(url)
     except ValueError as e:
         log.critical('url %s failed on err %s' % (url, str(e)))
-        # print 'url %s failed on err %s' % (url, str(e))
+        # print('url %s failed on err %s' % (url, str(e)))
         proper_url = ''
 
     return proper_url

View file

@@ -196,7 +196,7 @@ def cache_disk(seconds=(86400*5), cache_folder="/tmp"):
             """Calculate a cache key based on the decorated method signature
             args[1] indicates the domain of the inputs, we hash on domain!
             """
-            key = sha1(str(args[1]) + str(kwargs)).hexdigest()
+            key = sha1((str(args[1]) + str(kwargs)).encode('utf-8')).hexdigest()
             filepath = os.path.join(cache_folder, key)
 
             # verify that the cached object exists and is less than
View file

@@ -10,4 +10,4 @@ six==1.7.3
 feedparser==5.1.3
 tldextract==1.5.1
 feedfinder2==0.0.1
-jieba==0.34
+-e git+https://github.com/fxsjy/jieba.git@jieba3k#egg=jieba

View file

@@ -58,7 +58,7 @@ def asyncio_run(urls):
     pass
     # rs = (grequests.request('GET', u, **req_kwargs) for u in urls)
     # responses = async_request(urls)
-    # print responses
+    # print(responses)
 
 
 def benchmark():

View file

@@ -88,7 +88,7 @@ class ArticleTestCase(unittest.TestCase):
     def test_download_html(self):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.download()
-        assert len(self.article.html) == 75244
+        assert len(self.article.html) == 75176
 
     @print_test
     def test_pre_download_parse(self):
@@ -209,8 +209,8 @@ class ArticleTestCase(unittest.TestCase):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.build()
         self.article.nlp()
-        # print self.article.summary
-        # print self.article.keywords
+        # print(self.article.summary)
+        # print(self.article.keywords)
         assert self.article.summary == SUMMARY
         assert self.article.keywords == KEYWORDS
@@ -388,8 +388,8 @@ class EncodingTestCase(unittest.TestCase):
     @print_test
     def test_smart_str(self):
-        assert smart_str(self.uni_string) == "ˆˆø∆ßåßlucas yang˜"
-        assert smart_str(self.normal_string) == "∆ƒˆƒ´´lucas yang"
+        assert smart_str(self.uni_string) == b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c'
+        assert smart_str(self.normal_string) == b'\xe2\x88\x86\xc6\x92\xcb\x86\xc6\x92\xc2\xb4\xc2\xb4lucas yang'
 
 
 class MThreadingTestCase(unittest.TestCase):
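The new expected values are just the UTF-8 encodings of the unicode test string, since smart_str now returns bytes under Python 3; a quick check (not newspaper's smart_str):

```python
# Encoding the str form of the test string yields the new bytes literal
s = '\u2206\u02c6\u02c6\u00f8\u2206\u00df\u00e5\u00dflucas yang\u02dc'
assert s.encode('utf-8') == (b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86'
                             b'\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c')
```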