Tests passing (almost)

The library appears to be updated and working. Two tests, `test_nlp_body`
and `test_parse_html`, still fail because their results are
non-deterministic: the assertions on the summary, keywords, and authors
compare against fixed expected values.
Paul English 2014-11-11 20:11:37 -07:00
parent 1579c10002
commit 32df727166
11 changed files with 46 additions and 42 deletions

View file

@@ -170,7 +170,7 @@ class DocumentCleaner(object):
                     and self.parser.getTag(next_node) == "a" \
                     and self.parser.getAttribute(
                         next_node, 'grv-usedalready') != 'yes':
-                outer = " " + self.parser.outerHtml(next_node) + " "
+                outer = " " + self.parser.outerHtml(next_node).decode('utf-8') + " "
                 replacement_text.append(outer)
                 nodes_to_remove.append(next_node)
                 self.parser.setAttribute(next_node, attr='grv-usedalready',
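For context, a minimal sketch (not from this commit) of why the decode is needed: under Python 3, lxml's serializer returns bytes, and concatenating bytes with str raises TypeError.

```python
import lxml.html

node = lxml.html.fragment_fromstring('<a href="#">link</a>')
raw = lxml.html.tostring(node)   # bytes under Python 3, e.g. b'<a href="#">link</a>'
# " " + raw + " " would raise TypeError: can only concatenate str (not "bytes") to str
outer = " " + raw.decode('utf-8') + " "
print(outer)
```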

View file

@@ -418,7 +418,7 @@ class ContentExtractor(object):
             return []
         # If we are extracting from raw text
         if regex:
-            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html)
+            doc_or_html = re.sub('<[^<]+?>', ' ', doc_or_html.decode('utf-8'))
             doc_or_html = re.findall(
                 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                 '(?:%[0-9a-fA-F][0-9a-fA-F]))+', doc_or_html)
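A standalone sketch of what this branch does, assuming a bytes input (the sample string is invented for illustration): decode, replace tags with spaces, then pull URLs from the remaining text.

```python
import re

raw = b'<p>Source: http://example.com/story (via feed)</p>'
# A str pattern cannot be applied to bytes in Python 3, hence the decode
text = re.sub('<[^<]+?>', ' ', raw.decode('utf-8'))
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                  r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
print(urls)  # ['http://example.com/story']
```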
@@ -762,7 +762,7 @@ class ContentExtractor(object):
         current_score = 0
         score_string = self.parser.getAttribute(node, 'gravityScore')
         if score_string:
-            current_score = int(score_string)
+            current_score = float(score_string)
 
         new_score = current_score + addToScore
         self.parser.setAttribute(node, "gravityScore", str(new_score))
@@ -813,7 +813,7 @@ class ContentExtractor(object):
         grvScoreString = self.parser.getAttribute(node, 'gravityScore')
         if not grvScoreString:
             return None
-        return int(grvScoreString)
+        return float(grvScoreString)
 
     def nodes_to_check(self, doc):
         """Returns a list of nodes we want to search

View file

@@ -78,7 +78,7 @@ def clean_url(url):
     """Url quotes unicode data out of urls
     """
     url = url.encode('utf8')
-    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url])
+    url = ''.join([urllib.parse.quote(c) if ord(c) >= 127 else c for c in url.decode('utf-8')])
     return url
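A simplified sketch of the quoting logic, using the hypothetical helper name quote_unicode (the real clean_url also round-trips through encode/decode): iterating a Python 3 str yields one-character strings, so ord() and quote() work per character.

```python
import urllib.parse

def quote_unicode(url):
    # Percent-quote only non-ASCII characters; leave the rest untouched
    return ''.join(urllib.parse.quote(c) if ord(c) >= 127 else c for c in url)

print(quote_unicode('http://example.com/caf\u00e9'))  # http://example.com/caf%C3%A9
```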
@@ -136,7 +136,7 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
                 print('we caught a favicon!: %s' % url)
             else:
                 # import traceback
-                # print traceback.format_exc()
+                # print(traceback.format_exc())
                 print('PIL feed() failure for image:', url, str(e))
                 raise e
         p = None

View file

@@ -112,23 +112,26 @@ def keywords(text):
     """
     text = split_words(text)
     # of words before removing blacklist words
-    num_words = len(text)
-    text = [x for x in text if x not in stopwords]
-    freq = Counter()
-    for word in text:
-        freq[word] += 1
+    if text:
+        num_words = len(text)
+        text = [x for x in text if x not in stopwords]
+        freq = Counter()
+        for word in text:
+            freq[word] += 1
 
-    min_size = min(10, len(freq))
-    keywords = tuple(freq.most_common(min_size))
-    keywords = dict((x, y) for x, y in keywords)
+        min_size = min(10, len(freq))
+        keywords = tuple(freq.most_common(min_size))
+        keywords = dict((x, y) for x, y in keywords)
 
-    for k in keywords:
-        articleScore = keywords[k]*1.0 / max(num_words, 1)
-        keywords[k] = articleScore * 1.5 + 1
+        for k in keywords:
+            articleScore = keywords[k]*1.0 / max(num_words, 1)
+            keywords[k] = articleScore * 1.5 + 1
 
-    keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
-    keywords.reverse()
-    return dict(keywords)
+        keywords = sorted(iter(keywords.items()), key=operator.itemgetter(1))
+        keywords.reverse()
+        return dict(keywords)
+    else:
+        return dict()
 
 
 def split_sentences(text):
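For readability, the guarded branch as a hypothetical standalone function keyword_scores (stopwords is assumed to be a set; it is a module-level name in the diff):

```python
from collections import Counter

def keyword_scores(words, stopwords=frozenset()):
    # Mirrors the guarded branch above: empty input short-circuits to {}
    if not words:
        return {}
    num_words = len(words)                        # counted before filtering
    words = [w for w in words if w not in stopwords]
    freq = Counter(words)
    top = dict(freq.most_common(min(10, len(freq))))
    # Normalize by total word count, then scale: score * 1.5 + 1
    return {w: (c / max(num_words, 1)) * 1.5 + 1 for w, c in top.items()}

print(keyword_scores('the quick fox and the lazy fox'.split(), {'the', 'and'}))
```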
@@ -137,7 +140,7 @@ def split_sentences(text):
     import nltk.data
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
-    sentences = tokenizer.tokenize(text)
+    sentences = tokenizer.tokenize(text.decode('utf-8'))
     sentences = [x.replace('\n', '') for x in sentences if len(x) > 10]
     return sentences
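A minimal usage sketch of the tokenizer call (now fed str instead of bytes), assuming the NLTK punkt data can be downloaded:

```python
import nltk

nltk.download('punkt', quiet=True)  # fetch tokenizer data on first run
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = "Hi. This sentence is long enough to keep.\nAnd so is this second one."
sentences = [s.replace('\n', '') for s in tokenizer.tokenize(text) if len(s) > 10]
print(sentences)  # the short "Hi." is filtered out
```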
@@ -147,12 +150,15 @@ def length_score(sentence_len):
 
 
 def title_score(title, sentence):
-    title = [x for x in title if x not in stopwords]
-    count = 0.0
-    for word in sentence:
-        if (word not in stopwords and word in title):
-            count += 1.0
-    return count / max(len(title), 1)
+    if title:
+        title = [x for x in title if x not in stopwords]
+        count = 0.0
+        for word in sentence:
+            if (word not in stopwords and word in title):
+                count += 1.0
+        return count / max(len(title), 1)
+    else:
+        return 0
 
 
 def sentence_position(i, size):
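The new guard matters when title is None, which would otherwise raise TypeError on iteration (an empty list already scored 0). A hypothetical standalone version with stopwords as a parameter:

```python
def title_score(title, sentence, stopwords=frozenset()):
    # Guarded version: a None title no longer raises when iterated
    if title:
        title = [w for w in title if w not in stopwords]
        count = sum(1.0 for w in sentence
                    if w not in stopwords and w in title)
        return count / max(len(title), 1)
    return 0

print(title_score(['fox', 'news'], ['the', 'fox', 'ran'], {'the'}))  # 0.5
print(title_score(None, ['anything']))                               # 0
```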

View file

@@ -326,8 +326,8 @@ class Source(object):
             self.articles = [a for a in self.articles if a.html]
         else:
             if threads > 5:
-                print ('Using 5+ threads on a single source '
-                       'may get you rate limited!')
+                print(('Using 5+ threads on a single source '
+                       'may get you rate limited!'))
             filled_requests = network.multithread_request(urls, self.config)
             # Note that the responses are returned in original order
             for index, req in enumerate(filled_requests):

View file

@@ -76,11 +76,9 @@ class StopWords(object):
         content_is_unicode = isinstance(content, str)
         if content_is_unicode:
             content = content.encode('utf-8')
-        stripped_input = content.translate(
-            self.TRANS_TABLE, string.punctuation)
+        trans_table = {ord(c): None for c in string.punctuation}
+        stripped_input = content.decode('utf-8').translate(trans_table)
 
-        if content_is_unicode:
-            return stripped_input.decode('utf-8')
         return stripped_input
 
     def candidate_words(self, stripped_input):
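This rewrite is forced by Python 3, where str.translate() takes a single mapping of code points instead of the old two-argument (table, deletechars) form; mapping to None deletes the character:

```python
import string

# Python 3: str.translate() takes a {codepoint: replacement} mapping
trans_table = {ord(c): None for c in string.punctuation}
print('Hello, world! (test)'.translate(trans_table))  # Hello world test
```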

View file

@@ -88,7 +88,7 @@ def prepare_url(url, source_url=None):
         proper_url = remove_args(url)
     except ValueError as e:
         log.critical('url %s failed on err %s' % (url, str(e)))
-        # print 'url %s failed on err %s' % (url, str(e))
+        # print('url %s failed on err %s' % (url, str(e)))
         proper_url = ''
 
     return proper_url

View file

@@ -196,7 +196,7 @@ def cache_disk(seconds=(86400*5), cache_folder="/tmp"):
             """Calculate a cache key based on the decorated method signature
             args[1] indicates the domain of the inputs, we hash on domain!
             """
-            key = sha1(str(args[1]) + str(kwargs)).hexdigest()
+            key = sha1((str(args[1]) + str(kwargs)).encode('utf-8')).hexdigest()
             filepath = os.path.join(cache_folder, key)
 
             # verify that the cached object exists and is less than
View file

@@ -10,4 +10,4 @@ six==1.7.3
 feedparser==5.1.3
 tldextract==1.5.1
 feedfinder2==0.0.1
-jieba==0.34
+-e git+https://github.com/fxsjy/jieba.git@jieba3k#egg=jieba

View file

@@ -58,7 +58,7 @@ def asyncio_run(urls):
     pass
     # rs = (grequests.request('GET', u, **req_kwargs) for u in urls)
     # responses = async_request(urls)
-    # print responses
+    # print(responses)
 
 
 def benchmark():

View file

@@ -88,7 +88,7 @@ class ArticleTestCase(unittest.TestCase):
     def test_download_html(self):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.download()
-        assert len(self.article.html) == 75244
+        assert len(self.article.html) == 75176
 
     @print_test
     def test_pre_download_parse(self):
@@ -209,8 +209,8 @@ class ArticleTestCase(unittest.TestCase):
         mock_response_with(self.article.url, 'cnn_article')
         self.article.build()
         self.article.nlp()
-        # print self.article.summary
-        # print self.article.keywords
+        # print(self.article.summary)
+        # print(self.article.keywords)
         assert self.article.summary == SUMMARY
         assert self.article.keywords == KEYWORDS
@@ -388,8 +388,8 @@ class EncodingTestCase(unittest.TestCase):
     @print_test
     def test_smart_str(self):
-        assert smart_str(self.uni_string) == "ˆˆø∆ßåßlucas yang˜"
-        assert smart_str(self.normal_string) == "∆ƒˆƒ´´lucas yang"
+        assert smart_str(self.uni_string) == b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c'
+        assert smart_str(self.normal_string) == b'\xe2\x88\x86\xc6\x92\xcb\x86\xc6\x92\xc2\xb4\xc2\xb4lucas yang'
 
 
 class MThreadingTestCase(unittest.TestCase):
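The new expected values are just the UTF-8 encodings of the unicode test string, since smart_str now returns bytes under Python 3; a quick check (not newspaper's smart_str):

```python
# Encoding the str form of the test string yields the new bytes literal
s = '\u2206\u02c6\u02c6\u00f8\u2206\u00df\u00e5\u00dflucas yang\u02dc'
assert s.encode('utf-8') == (b'\xe2\x88\x86\xcb\x86\xcb\x86\xc3\xb8\xe2\x88\x86'
                             b'\xc3\x9f\xc3\xa5\xc3\x9flucas yang\xcb\x9c')
```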