From 32df727166dc81eea151f657d44fd92857329976 Mon Sep 17 00:00:00 2001
From: Paul English
Date: Tue, 11 Nov 2014 20:11:37 -0700
Subject: [PATCH] Tests passing (almost)

The library seems to be updated and working. Two tests, `test_nlp_body` and
`test_parse_html`, still fail: their assertions on the summary, keywords, and
authors compare against non-deterministic results.
---
 newspaper/cleaners.py       |  2 +-
 newspaper/extractors.py     |  6 ++---
 newspaper/images.py         |  4 ++--
 newspaper/nlp.py            | 48 +++++++++++++++++++++----------------
 newspaper/source.py         |  4 ++--
 newspaper/text.py           |  6 ++---
 newspaper/urls.py           |  2 +-
 newspaper/utils/__init__.py |  2 +-
 requirements.txt            |  2 +-
 tests/benchmarks.py         |  2 +-
 tests/unit_tests.py         | 10 ++++----
 11 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/newspaper/cleaners.py b/newspaper/cleaners.py
index 5de87b0..d9b8409 100644
--- a/newspaper/cleaners.py
+++ b/newspaper/cleaners.py
@@ -170,7 +170,7 @@ class DocumentCleaner(object):
                     and self.parser.getTag(next_node) == "a" \
                     and self.parser.getAttribute(
                         next_node, 'grv-usedalready') != 'yes':
-                outer = " " + self.parser.outerHtml(next_node) + " "
+                outer = " " + self.parser.outerHtml(next_node).decode('utf-8') + " "
                 replacement_text.append(outer)
                 nodes_to_remove.append(next_node)
                 self.parser.setAttribute(next_node, attr='grv-usedalready',
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 2c5ed37..32f90ff 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -418,7 +418,7 @@ class ContentExtractor(object):
             return []
         # If we are extracting from raw text
         if regex:
-            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html)
+            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html.decode('utf-8'))
             doc_or_html = re.findall(
                 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                 '(?:%[0-9a-fA-F][0-9a-fA-F]))+', doc_or_html)
@@ -762,7 +762,7 @@ class ContentExtractor(object):
         current_score = 0
         score_string = self.parser.getAttribute(node, 'gravityScore')
         if score_string:
-            current_score = int(score_string)
+            current_score = float(score_string)

         new_score = current_score + addToScore
         self.parser.setAttribute(node, "gravityScore", str(new_score))
@@ -813,7 +813,7 @@ class ContentExtractor(object):
         grvScoreString = self.parser.getAttribute(node, 'gravityScore')
         if not grvScoreString:
             return None
-        return int(grvScoreString)
+        return float(grvScoreString)

     def nodes_to_check(self, doc):
         """Returns a list of nodes we want to search
diff --git a/newspaper/images.py b/newspaper/images.py
index 612267f..0911508 100644
--- a/newspaper/images.py
+++ b/newspaper/images.py
@@ -78,7 +78,7 @@ def clean_url(url):
     """Url quotes unicode data out of urls
     """
     url = url.encode('utf8')
-    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url])
+    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url.decode('utf-8')])
     return url


@@ -136,7 +136,7 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
                 print('we caught a favicon!: %s' % url)
             else:
                 # import traceback
-                # print traceback.format_exc()
+                # print(traceback.format_exc())
                 print('PIL feed() failure for image:', url, str(e))
                 raise e
     p = None
diff --git a/newspaper/nlp.py b/newspaper/nlp.py
index 5b26cac..17e3fce 100644
--- a/newspaper/nlp.py
+++ b/newspaper/nlp.py
@@ -112,23 +112,26 @@ def keywords(text):
     """
     text = split_words(text)
     # of words before removing blacklist words
-    num_words = len(text)
-    text = [x for x in text if x not in stopwords]
-    freq = Counter()
-    for word in text:
-        freq[word] += 1
+    if text:
+        num_words = len(text)
+        text = [x for x in text if x not in stopwords]
+        freq = Counter()
+        for word in text:
+            freq[word] += 1

-    min_size = min(10, len(freq))
-    keywords = tuple(freq.most_common(min_size))
-    keywords = dict((x, y) for x, y in keywords)
+        min_size = min(10, len(freq))
+        keywords = tuple(freq.most_common(min_size))
+        keywords = dict((x, y) for x, y in keywords)

-    for k in keywords:
-        articleScore = keywords[k]*1.0 / max(num_words, 1)
-        keywords[k] = articleScore * 1.5 + 1
+        for k in keywords:
+            articleScore = keywords[k]*1.0 / max(num_words, 1)
+            keywords[k] = articleScore * 1.5 + 1

-    keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
-    keywords.reverse()
-    return dict(keywords)
+        keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
+        keywords.reverse()
+        return dict(keywords)
+    else:
+        return dict()


 def split_sentences(text):
@@ -137,7 +140,7 @@
     import nltk.data
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-    sentences = tokenizer.tokenize(text)
+    sentences = tokenizer.tokenize(text.decode('utf-8'))
     sentences = [x.replace('\n', '') for x in sentences if len(x) > 10]
     return sentences
@@ -147,12 +150,15 @@ def length_score(sentence_len):


 def title_score(title, sentence):
-    title = [x for x in title if x not in stopwords]
-    count = 0.0
-    for word in sentence:
-        if (word not in stopwords and word in title):
-            count += 1.0
-    return count / max(len(title), 1)
+    if title:
+        title = [x for x in title if x not in stopwords]
+        count = 0.0
+        for word in sentence:
+            if (word not in stopwords and word in title):
+                count += 1.0
+        return count / max(len(title), 1)
+    else:
+        return 0


 def sentence_position(i, size):
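Note: the .decode('utf-8') calls added above in cleaners.py, extractors.py,
images.py, and nlp.py all follow from the same Python 3 change: parser and I/O
calls hand back bytes, and bytes no longer coerce to str the way they did on
Python 2. A minimal standalone sketch of the failure mode and the fix; the
byte string here is an illustrative stand-in for what a call like outerHtml()
returns, not output captured from the library:

    outer_html = b"<a href='/x'>link</a>"  # hypothetical parser return value

    try:
        " " + outer_html + " "  # the Python 2 habit
    except TypeError:
        pass  # bytes + str raises under Python 3 instead of coercing

    # The pattern used throughout this patch: decode once at the boundary,
    # then work in str from there on.
    wrapped = " " + outer_html.decode('utf-8') + " "
    assert wrapped == " <a href='/x'>link</a> "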
diff --git a/newspaper/source.py b/newspaper/source.py
index ab58991..56d5e3d 100644
--- a/newspaper/source.py
+++ b/newspaper/source.py
@@ -326,8 +326,8 @@ class Source(object):
             self.articles = [a for a in self.articles if a.html]
         else:
             if threads > 5:
-                print ('Using 5+ threads on a single source '
-                       'may get you rate limited!')
+                print(('Using 5+ threads on a single source '
+                       'may get you rate limited!'))
             filled_requests = network.multithread_request(urls, self.config)
             # Note that the responses are returned in original order
             for index, req in enumerate(filled_requests):
diff --git a/newspaper/text.py b/newspaper/text.py
index 8b7ff33..d2bbc5a 100644
--- a/newspaper/text.py
+++ b/newspaper/text.py
@@ -76,11 +76,9 @@ class StopWords(object):
         content_is_unicode = isinstance(content, str)
         if content_is_unicode:
             content = content.encode('utf-8')
-        stripped_input = content.translate(
-            self.TRANS_TABLE, string.punctuation)
+        trans_table = {ord(c): None for c in string.punctuation}
+        stripped_input = content.decode('utf-8').translate(trans_table)

-        if content_is_unicode:
-            return stripped_input.decode('utf-8')
         return stripped_input

     def candidate_words(self, stripped_input):
diff --git a/newspaper/urls.py b/newspaper/urls.py
index d67f1f5..e83c319 100644
--- a/newspaper/urls.py
+++ b/newspaper/urls.py
@@ -88,7 +88,7 @@ def prepare_url(url, source_url=None):
         proper_url = remove_args(url)
     except ValueError as e:
         log.critical('url %s failed on err %s' % (url, str(e)))
-        # print 'url %s failed on err %s' % (url, str(e))
+        # print('url %s failed on err %s' % (url, str(e)))
         proper_url = ''

     return proper_url
diff --git a/newspaper/utils/__init__.py b/newspaper/utils/__init__.py
index b844cf2..1bc7db8 100644
--- a/newspaper/utils/__init__.py
+++ b/newspaper/utils/__init__.py
@@ -196,7 +196,7 @@ def cache_disk(seconds=(86400*5), cache_folder="/tmp"):
             """Calculate a cache key based on the decorated method signature
             args[1] indicates the domain of the inputs, we hash on domain!
             """
-            key = sha1(str(args[1]) + str(kwargs)).hexdigest()
+            key = sha1((str(args[1]) + str(kwargs)).encode('utf-8')).hexdigest()
             filepath = os.path.join(cache_folder, key)

             # verify that the cached object exists and is less than
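Note: the text.py and utils changes above track two further Python 3 API
shifts: str.translate() takes a single mapping instead of the old
(table, deletechars) pair, and hashlib digests accept only bytes. A short
standard-library-only sketch; the 'example.com' key material is made up for
illustration:

    import string
    from hashlib import sha1

    # str.translate() on Python 3: map ord(char) -> None to delete it.
    trans_table = {ord(c): None for c in string.punctuation}
    assert 'Hello, world!'.translate(trans_table) == 'Hello world'

    # sha1() rejects str input, hence the .encode('utf-8') in cache_disk.
    key = sha1(('example.com' + str({})).encode('utf-8')).hexdigest()
    assert len(key) == 40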
diff --git a/requirements.txt b/requirements.txt
index 3092f91..9b936c3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ six==1.7.3
 feedparser==5.1.3
 tldextract==1.5.1
 feedfinder2==0.0.1
-jieba==0.34
+-e git+https://github.com/fxsjy/jieba.git@jieba3k#egg=jieba
diff --git a/tests/benchmarks.py b/tests/benchmarks.py
index afd4d0d..ea3a0e2 100644
--- a/tests/benchmarks.py
+++ b/tests/benchmarks.py
@@ -58,7 +58,7 @@ def asyncio_run(urls):
     pass
     # rs = (grequests.request('GET', u, **req_kwargs) for u in urls)
     # responses = async_request(urls)
-    # print responses
+    # print(responses)


 def benchmark():
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 445c06b..a915b51 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -88,7 +88,7 @@ class ArticleTestCase(unittest.TestCase):
     def test_download_html(self):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.download()
-        assert len(self.article.html) == 75244
+        assert len(self.article.html) == 75176

     @print_test
     def test_pre_download_parse(self):
@@ -209,8 +209,8 @@ class ArticleTestCase(unittest.TestCase):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.build()
         self.article.nlp()
-        # print self.article.summary
-        # print self.article.keywords
+        # print(self.article.summary)
+        # print(self.article.keywords)
         assert self.article.summary == SUMMARY
         assert self.article.keywords == KEYWORDS

@@ -388,8 +388,8 @@ class EncodingTestCase(unittest.TestCase):

     @print_test
     def test_smart_str(self):
-        assert smart_str(self.uni_string) == "∆ˆˆø∆ßåßlucas yang˜"
-        assert smart_str(self.normal_string) == "∆ƒˆƒ´´lucas yang"
+        assert smart_str(self.uni_string) == b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c'
+        assert smart_str(self.normal_string) == b'\xe2\x88\x86\xc6\x92\xcb\x86\xc6\x92\xc2\xb4\xc2\xb4lucas yang'


 class MThreadingTestCase(unittest.TestCase):
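Note: the new test_smart_str expectations are just the UTF-8 encodings of the
original unicode literals, since smart_str() now returns bytes under Python 3.
A quick standalone check:

    s = '∆ˆˆø∆ßåßlucas yang˜'
    assert s.encode('utf-8') == (b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86'
                                 b'\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c')

For the two non-deterministic failures called out in the commit message, one
possible follow-up (not part of this patch) would be asserting on stable
properties of the nlp output, such as keyword membership or summary length
bounds, rather than exact equality.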