Mirror of https://github.com/codelucas/newspaper.git, synced 2025-12-23 05:36:50 +00:00
Tests passing (almost)
The library seems to be updated and working. Two tests, `test_nlp_body` and `test_parse_html`, still fail: their assertions compare the summary, keywords, and authors against fixed expected values, but the extracted results are non-deterministic and differ between runs.
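As a rough sketch of how the remaining flaky assertions could be made order-insensitive (assuming `article.keywords` and `article.authors` are lists of strings and `article.summary` is newline-joined sentences; the helper below is hypothetical and not part of this commit):

def assert_nlp_results(article, expected_keywords, expected_authors):
    # Keyword and author extraction can return the same items in a
    # different order on each run, so compare them as sets.
    assert set(article.keywords) == set(expected_keywords)
    assert set(article.authors) == set(expected_authors)
    # The summary is assembled from scored sentences; instead of an exact
    # string match, check that every summary sentence comes from the text.
    for sentence in article.summary.splitlines():
        assert sentence in article.text

This only sidesteps the ordering problem; if the sentence and keyword scores themselves vary between runs, the expected values would still need to be loosened further.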
This commit is contained in:
parent 1579c10002
commit 32df727166
11 changed files with 46 additions and 42 deletions
@@ -170,7 +170,7 @@ class DocumentCleaner(object):
                     and self.parser.getTag(next_node) == "a" \
                     and self.parser.getAttribute(
                         next_node, 'grv-usedalready') != 'yes':
-                outer = " " + self.parser.outerHtml(next_node) + " "
+                outer = " " + self.parser.outerHtml(next_node).decode('utf-8') + " "
                 replacement_text.append(outer)
                 nodes_to_remove.append(next_node)
                 self.parser.setAttribute(next_node, attr='grv-usedalready',
@@ -418,7 +418,7 @@ class ContentExtractor(object):
             return []
         # If we are extracting from raw text
         if regex:
-            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html)
+            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html.decode('utf-8'))
             doc_or_html = re.findall(
                 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                 '(?:%[0-9a-fA-F][0-9a-fA-F]))+', doc_or_html)
@@ -762,7 +762,7 @@ class ContentExtractor(object):
         current_score = 0
         score_string = self.parser.getAttribute(node, 'gravityScore')
         if score_string:
-            current_score = int(score_string)
+            current_score = float(score_string)
 
         new_score = current_score + addToScore
         self.parser.setAttribute(node, "gravityScore", str(new_score))
@@ -813,7 +813,7 @@ class ContentExtractor(object):
         grvScoreString = self.parser.getAttribute(node, 'gravityScore')
         if not grvScoreString:
             return None
-        return int(grvScoreString)
+        return float(grvScoreString)
 
     def nodes_to_check(self, doc):
         """Returns a list of nodes we want to search
@@ -78,7 +78,7 @@ def clean_url(url):
     """Url quotes unicode data out of urls
     """
     url = url.encode('utf8')
-    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url])
+    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url.decode('utf-8')])
     return url
 
 
@@ -136,7 +136,7 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
                 print('we caught a favicon!: %s' % url)
             else:
                 # import traceback
-                # print traceback.format_exc()
+                # print(traceback.format_exc())
                 print('PIL feed() failure for image:', url, str(e))
                 raise e
         p = None
@@ -112,23 +112,26 @@ def keywords(text):
     """
     text = split_words(text)
     # of words before removing blacklist words
-    num_words = len(text)
-    text = [x for x in text if x not in stopwords]
-    freq = Counter()
-    for word in text:
-        freq[word] += 1
+    if text:
+        num_words = len(text)
+        text = [x for x in text if x not in stopwords]
+        freq = Counter()
+        for word in text:
+            freq[word] += 1
 
-    min_size = min(10, len(freq))
-    keywords = tuple(freq.most_common(min_size))
-    keywords = dict((x, y) for x, y in keywords)
+        min_size = min(10, len(freq))
+        keywords = tuple(freq.most_common(min_size))
+        keywords = dict((x, y) for x, y in keywords)
 
-    for k in keywords:
-        articleScore = keywords[k]*1.0 / max(num_words, 1)
-        keywords[k] = articleScore * 1.5 + 1
+        for k in keywords:
+            articleScore = keywords[k]*1.0 / max(num_words, 1)
+            keywords[k] = articleScore * 1.5 + 1
 
-    keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
-    keywords.reverse()
-    return dict(keywords)
+        keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
+        keywords.reverse()
+        return dict(keywords)
+    else:
+        return dict()
 
 
 def split_sentences(text):
@@ -137,7 +140,7 @@ def split_sentences(text):
     import nltk.data
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
 
-    sentences = tokenizer.tokenize(text)
+    sentences = tokenizer.tokenize(text.decode('utf-8'))
     sentences = [x.replace('\n', '') for x in sentences if len(x) > 10]
     return sentences
 
@@ -147,12 +150,15 @@ def length_score(sentence_len):
 
 
 def title_score(title, sentence):
-    title = [x for x in title if x not in stopwords]
-    count = 0.0
-    for word in sentence:
-        if (word not in stopwords and word in title):
-            count += 1.0
-    return count / max(len(title), 1)
+    if title:
+        title = [x for x in title if x not in stopwords]
+        count = 0.0
+        for word in sentence:
+            if (word not in stopwords and word in title):
+                count += 1.0
+        return count / max(len(title), 1)
+    else:
+        return 0
 
 
 def sentence_position(i, size):
@@ -326,8 +326,8 @@ class Source(object):
             self.articles = [a for a in self.articles if a.html]
         else:
             if threads > 5:
-                print ('Using 5+ threads on a single source '
-                       'may get you rate limited!')
+                print(('Using 5+ threads on a single source '
+                       'may get you rate limited!'))
             filled_requests = network.multithread_request(urls, self.config)
             # Note that the responses are returned in original order
             for index, req in enumerate(filled_requests):
@@ -76,11 +76,9 @@ class StopWords(object):
         content_is_unicode = isinstance(content, str)
-        if content_is_unicode:
-            content = content.encode('utf-8')
-        stripped_input = content.translate(
-            self.TRANS_TABLE, string.punctuation)
+        trans_table = {ord(c): None for c in string.punctuation}
+        stripped_input = content.decode('utf-8').translate(trans_table)
 
         if content_is_unicode:
             return stripped_input.decode('utf-8')
         return stripped_input
 
     def candidate_words(self, stripped_input):
@@ -88,7 +88,7 @@ def prepare_url(url, source_url=None):
         proper_url = remove_args(url)
     except ValueError as e:
         log.critical('url %s failed on err %s' % (url, str(e)))
-        # print 'url %s failed on err %s' % (url, str(e))
+        # print('url %s failed on err %s' % (url, str(e)))
         proper_url = ''
 
     return proper_url
@@ -196,7 +196,7 @@ def cache_disk(seconds=(86400*5), cache_folder="/tmp"):
             """Calculate a cache key based on the decorated method signature
             args[1] indicates the domain of the inputs, we hash on domain!
             """
-            key = sha1(str(args[1]) + str(kwargs)).hexdigest()
+            key = sha1((str(args[1]) + str(kwargs)).encode('utf-8')).hexdigest()
             filepath = os.path.join(cache_folder, key)
 
             # verify that the cached object exists and is less than
@@ -10,4 +10,4 @@ six==1.7.3
 feedparser==5.1.3
 tldextract==1.5.1
 feedfinder2==0.0.1
-jieba==0.34
+-e git+https://github.com/fxsjy/jieba.git@jieba3k#egg=jieba
@@ -58,7 +58,7 @@ def asyncio_run(urls):
     pass
     # rs = (grequests.request('GET', u, **req_kwargs) for u in urls)
     # responses = async_request(urls)
-    # print responses
+    # print(responses)
 
 
 def benchmark():
@@ -88,7 +88,7 @@ class ArticleTestCase(unittest.TestCase):
     def test_download_html(self):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.download()
-        assert len(self.article.html) == 75244
+        assert len(self.article.html) == 75176
 
     @print_test
     def test_pre_download_parse(self):
@@ -209,8 +209,8 @@ class ArticleTestCase(unittest.TestCase):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.build()
         self.article.nlp()
-        # print self.article.summary
-        # print self.article.keywords
+        # print(self.article.summary)
+        # print(self.article.keywords)
         assert self.article.summary == SUMMARY
         assert self.article.keywords == KEYWORDS
 
@@ -388,8 +388,8 @@ class EncodingTestCase(unittest.TestCase):
 
     @print_test
     def test_smart_str(self):
-        assert smart_str(self.uni_string) == "∆ˆˆø∆ßåßlucas yang˜"
-        assert smart_str(self.normal_string) == "∆ƒˆƒ´´lucas yang"
+        assert smart_str(self.uni_string) == b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c'
+        assert smart_str(self.normal_string) == b'\xe2\x88\x86\xc6\x92\xcb\x86\xc6\x92\xc2\xb4\xc2\xb4lucas yang'
 
 
 class MThreadingTestCase(unittest.TestCase):