From 32df727166dc81eea151f657d44fd92857329976 Mon Sep 17 00:00:00 2001
From: Paul English
Date: Tue, 11 Nov 2014 20:11:37 -0700
Subject: [PATCH] Tests passing (almost)

The library seems to be updated and working. Two tests, `test_nlp_body` and
`test_parse_html`, still fail: their assertions on the summary, keywords, and
authors compare against non-deterministic results.
---
 newspaper/cleaners.py       |  2 +-
 newspaper/extractors.py     |  6 ++---
 newspaper/images.py         |  4 ++--
 newspaper/nlp.py            | 48 +++++++++++++++++++++----------------
 newspaper/source.py         |  4 ++--
 newspaper/text.py           |  6 ++---
 newspaper/urls.py           |  2 +-
 newspaper/utils/__init__.py |  2 +-
 requirements.txt            |  2 +-
 tests/benchmarks.py         |  2 +-
 tests/unit_tests.py         | 10 ++++----
 11 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/newspaper/cleaners.py b/newspaper/cleaners.py
index 5de87b0..d9b8409 100644
--- a/newspaper/cleaners.py
+++ b/newspaper/cleaners.py
@@ -170,7 +170,7 @@ class DocumentCleaner(object):
                     and self.parser.getTag(next_node) == "a" \
                     and self.parser.getAttribute(
                         next_node, 'grv-usedalready') != 'yes':
-                outer = " " + self.parser.outerHtml(next_node) + " "
+                outer = " " + self.parser.outerHtml(next_node).decode('utf-8') + " "
                 replacement_text.append(outer)
                 nodes_to_remove.append(next_node)
                 self.parser.setAttribute(next_node, attr='grv-usedalready',
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 2c5ed37..32f90ff 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -418,7 +418,7 @@ class ContentExtractor(object):
             return []
         # If we are extracting from raw text
         if regex:
-            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html)
+            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html.decode('utf-8'))
             doc_or_html = re.findall(
                 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                 '(?:%[0-9a-fA-F][0-9a-fA-F]))+', doc_or_html)
@@ -762,7 +762,7 @@ class ContentExtractor(object):
         current_score = 0
         score_string = self.parser.getAttribute(node, 'gravityScore')
         if score_string:
-            current_score = int(score_string)
+            current_score = float(score_string)

         new_score = current_score + addToScore
         self.parser.setAttribute(node, "gravityScore", str(new_score))
@@ -813,7 +813,7 @@ class ContentExtractor(object):
         grvScoreString = self.parser.getAttribute(node, 'gravityScore')
         if not grvScoreString:
             return None
-        return int(grvScoreString)
+        return float(grvScoreString)

     def nodes_to_check(self, doc):
         """Returns a list of nodes we want to search
diff --git a/newspaper/images.py b/newspaper/images.py
index 612267f..0911508 100644
--- a/newspaper/images.py
+++ b/newspaper/images.py
@@ -78,7 +78,7 @@ def clean_url(url):
     """Url quotes unicode data out of urls
     """
     url = url.encode('utf8')
-    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url])
+    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url.decode('utf-8')])
     return url


@@ -136,7 +136,7 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
                 print('we caught a favicon!: %s' % url)
             else:
                 # import traceback
-                # print traceback.format_exc()
+                # print(traceback.format_exc())
                 print('PIL feed() failure for image:', url, str(e))
                 raise e
     p = None
diff --git a/newspaper/nlp.py b/newspaper/nlp.py
index 5b26cac..17e3fce 100644
--- a/newspaper/nlp.py
+++ b/newspaper/nlp.py
@@ -112,23 +112,26 @@ def keywords(text):
     """
     text = split_words(text)
     # of words before removing blacklist words
-    num_words = len(text)
-    text = [x for x in text if x not in stopwords]
-    freq = Counter()
-    for word in text:
-        freq[word] += 1
+    if text:
+        num_words = len(text)
+        text = [x for x in text if x not in stopwords]
+        freq = Counter()
+        for word in text:
+            freq[word] += 1

-    min_size = min(10, len(freq))
-    keywords = tuple(freq.most_common(min_size))
-    keywords = dict((x, y) for x, y in keywords)
+        min_size = min(10, len(freq))
+        keywords = tuple(freq.most_common(min_size))
+        keywords = dict((x, y) for x, y in keywords)

-    for k in keywords:
-        articleScore = keywords[k]*1.0 / max(num_words, 1)
-        keywords[k] = articleScore * 1.5 + 1
+        for k in keywords:
+            articleScore = keywords[k]*1.0 / max(num_words, 1)
+            keywords[k] = articleScore * 1.5 + 1

-    keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
-    keywords.reverse()
-    return dict(keywords)
+        keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
+        keywords.reverse()
+        return dict(keywords)
+    else:
+        return dict()


 def split_sentences(text):
@@ -137,7 +140,7 @@
     import nltk.data
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-    sentences = tokenizer.tokenize(text)
+    sentences = tokenizer.tokenize(text.decode('utf-8'))
     sentences = [x.replace('\n', '') for x in sentences if len(x) > 10]
     return sentences
@@ -147,12 +150,15 @@ def length_score(sentence_len):


 def title_score(title, sentence):
-    title = [x for x in title if x not in stopwords]
-    count = 0.0
-    for word in sentence:
-        if (word not in stopwords and word in title):
-            count += 1.0
-    return count / max(len(title), 1)
+    if title:
+        title = [x for x in title if x not in stopwords]
+        count = 0.0
+        for word in sentence:
+            if (word not in stopwords and word in title):
+                count += 1.0
+        return count / max(len(title), 1)
+    else:
+        return 0


 def sentence_position(i, size):
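Note: the .decode('utf-8') calls added above in cleaners.py, extractors.py,
images.py, and nlp.py all follow from the same Python 3 change: parser and I/O
calls hand back bytes, and bytes no longer coerce to str the way they did on
Python 2. A minimal standalone sketch of the failure mode and the fix; the
byte string here is an illustrative stand-in for what a call like outerHtml()
returns, not output captured from the library:

    outer_html = b"<a href='/x'>link</a>"  # hypothetical parser return value

    try:
        " " + outer_html + " "  # the Python 2 habit
    except TypeError:
        pass  # bytes + str raises under Python 3 instead of coercing

    # The pattern used throughout this patch: decode once at the boundary,
    # then work in str from there on.
    wrapped = " " + outer_html.decode('utf-8') + " "
    assert wrapped == " <a href='/x'>link</a> "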
diff --git a/newspaper/source.py b/newspaper/source.py
index ab58991..56d5e3d 100644
--- a/newspaper/source.py
+++ b/newspaper/source.py
@@ -326,8 +326,8 @@ class Source(object):
             self.articles = [a for a in self.articles if a.html]
         else:
             if threads > 5:
-                print ('Using 5+ threads on a single source '
-                       'may get you rate limited!')
+                print(('Using 5+ threads on a single source '
+                       'may get you rate limited!'))
             filled_requests = network.multithread_request(urls, self.config)
             # Note that the responses are returned in original order
             for index, req in enumerate(filled_requests):
diff --git a/newspaper/text.py b/newspaper/text.py
index 8b7ff33..d2bbc5a 100644
--- a/newspaper/text.py
+++ b/newspaper/text.py
@@ -76,11 +76,9 @@ class StopWords(object):
         content_is_unicode = isinstance(content, str)
         if content_is_unicode:
             content = content.encode('utf-8')
-        stripped_input = content.translate(
-            self.TRANS_TABLE, string.punctuation)
+        trans_table = {ord(c): None for c in string.punctuation}
+        stripped_input = content.decode('utf-8').translate(trans_table)

-        if content_is_unicode:
-            return stripped_input.decode('utf-8')
         return stripped_input

     def candidate_words(self, stripped_input):
diff --git a/newspaper/urls.py b/newspaper/urls.py
index d67f1f5..e83c319 100644
--- a/newspaper/urls.py
+++ b/newspaper/urls.py
@@ -88,7 +88,7 @@ def prepare_url(url, source_url=None):
         proper_url = remove_args(url)
     except ValueError as e:
         log.critical('url %s failed on err %s' % (url, str(e)))
-        # print 'url %s failed on err %s' % (url, str(e))
+        # print('url %s failed on err %s' % (url, str(e)))
         proper_url = ''

     return proper_url
diff --git a/newspaper/utils/__init__.py b/newspaper/utils/__init__.py
index b844cf2..1bc7db8 100644
--- a/newspaper/utils/__init__.py
+++ b/newspaper/utils/__init__.py
@@ -196,7 +196,7 @@ def cache_disk(seconds=(86400*5), cache_folder="/tmp"):
             """Calculate a cache key based on the decorated method signature
             args[1] indicates the domain of the inputs, we hash on domain!
             """
-            key = sha1(str(args[1]) + str(kwargs)).hexdigest()
+            key = sha1((str(args[1]) + str(kwargs)).encode('utf-8')).hexdigest()
             filepath = os.path.join(cache_folder, key)

             # verify that the cached object exists and is less than
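Note: the text.py and utils changes above track two further Python 3 API
shifts: str.translate() takes a single mapping instead of the old
(table, deletechars) pair, and hashlib digests accept only bytes. A short
standard-library-only sketch; the 'example.com' key material is made up for
illustration:

    import string
    from hashlib import sha1

    # str.translate() on Python 3: map ord(char) -> None to delete it.
    trans_table = {ord(c): None for c in string.punctuation}
    assert 'Hello, world!'.translate(trans_table) == 'Hello world'

    # sha1() rejects str input, hence the .encode('utf-8') in cache_disk.
    key = sha1(('example.com' + str({})).encode('utf-8')).hexdigest()
    assert len(key) == 40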
diff --git a/requirements.txt b/requirements.txt
index 3092f91..9b936c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ six==1.7.3
 feedparser==5.1.3
 tldextract==1.5.1
 feedfinder2==0.0.1
-jieba==0.34
+-e git+https://github.com/fxsjy/jieba.git@jieba3k#egg=jieba
diff --git a/tests/benchmarks.py b/tests/benchmarks.py
index afd4d0d..ea3a0e2 100644
--- a/tests/benchmarks.py
+++ b/tests/benchmarks.py
@@ -58,7 +58,7 @@ def asyncio_run(urls):
     pass
     # rs = (grequests.request('GET', u, **req_kwargs) for u in urls)
     # responses = async_request(urls)
-    # print responses
+    # print(responses)


 def benchmark():
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 445c06b..a915b51 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -88,7 +88,7 @@ class ArticleTestCase(unittest.TestCase):
     def test_download_html(self):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.download()
-        assert len(self.article.html) == 75244
+        assert len(self.article.html) == 75176

     @print_test
     def test_pre_download_parse(self):
@@ -209,8 +209,8 @@ class ArticleTestCase(unittest.TestCase):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.build()
         self.article.nlp()
-        # print self.article.summary
-        # print self.article.keywords
+        # print(self.article.summary)
+        # print(self.article.keywords)
         assert self.article.summary == SUMMARY
         assert self.article.keywords == KEYWORDS

@@ -388,8 +388,8 @@ class EncodingTestCase(unittest.TestCase):

     @print_test
     def test_smart_str(self):
-        assert smart_str(self.uni_string) == "∆ˆˆø∆ßåßlucas yang˜"
-        assert smart_str(self.normal_string) == "∆ƒˆƒ´´lucas yang"
+        assert smart_str(self.uni_string) == b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c'
+        assert smart_str(self.normal_string) == b'\xe2\x88\x86\xc6\x92\xcb\x86\xc6\x92\xc2\xb4\xc2\xb4lucas yang'


 class MThreadingTestCase(unittest.TestCase):
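Note: the new test_smart_str expectations are just the UTF-8 encodings of the
original unicode literals, since smart_str() now returns bytes under Python 3.
A quick standalone check:

    s = '∆ˆˆø∆ßåßlucas yang˜'
    assert s.encode('utf-8') == (b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86'
                                 b'\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c')

For the two non-deterministic failures called out in the commit message, one
possible follow-up (not part of this patch) would be asserting on stable
properties of the nlp output, such as keyword membership or summary length
bounds, rather than exact equality.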