Improve mthreading.py code, add override threads option, remove unused

This commit is contained in:
Lucas Ou-Yang 2018-09-01 16:43:46 -07:00
parent beacce0e16
commit f1079bdcd9

View file

@ -11,6 +11,7 @@ __copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import queue
import traceback
from threading import Thread
from .configuration import Configuration
@ -80,7 +81,6 @@ class NewsPool(object):
>>> cnn_paper.articles[50].html
u'<html>blahblah ... '
"""
self.papers = []
self.pool = None
self.config = config or Configuration()
@ -94,17 +94,33 @@ class NewsPool(object):
'objects before .join(..)')
raise
self.pool.wait_completion()
self.papers = []
self.pool = None
def set(self, paper_list, threads_per_source=1):
self.papers = paper_list
num_threads = threads_per_source * len(self.papers)
def set(self, news_list, threads_per_source=1, override_threads=None):
"""
news_list can be a list of `Article`, `Source`, or both.
If caller wants to decide how many threads to use, they can use
`override_threads` which takes precedence over all. Otherwise,
this api infers that if the input is all `Source` objects, to
allocate one thread per `Source` to not spam the host.
If both of the above conditions are not true, default to 1 thread.
"""
from .source import Source
if override_threads is not None:
num_threads = override_threads
elif all([isinstance(n, Source) for n in news_list]):
num_threads = threads_per_source * len(news_list)
else:
num_threads = 1
timeout = self.config.thread_timeout_seconds
self.pool = ThreadPool(num_threads, timeout)
for paper in self.papers:
try:
self.pool.add_task(paper.download_articles)
except AttributeError:
self.pool.add_task(paper.download)
for news_object in news_list:
if isinstance(news_object, Source):
self.pool.add_task(news_object.download_articles)
else:
self.pool.add_task(news_object.download)