From 15bbd8a9dbd2f5285ba8919787efeeebaf563a42 Mon Sep 17 00:00:00 2001 From: Lucas Ou-Yang Date: Thu, 9 Jan 2014 00:28:14 -0800 Subject: [PATCH] added complete test cases for config setting --- newspaper/api.py | 26 ++++++++++++++------------ newspaper/article.py | 14 ++------------ newspaper/source.py | 14 ++------------ newspaper/utils/__init__.py | 11 +++++++++++ tests/unit_tests.py | 14 +++++++++++++- 5 files changed, 42 insertions(+), 37 deletions(-) diff --git a/newspaper/api.py b/newspaper/api.py index 5c90e41..71211e7 100644 --- a/newspaper/api.py +++ b/newspaper/api.py @@ -13,32 +13,34 @@ from .settings import POPULAR_URLS, TRENDING_URL from .configuration import Configuration from .mthreading import NewsPool from .configuration import Configuration -from .utils import print_available_languages +from .utils import print_available_languages, extend_config -def build(url=u'', config=None): +def build(url=u'', dry=False, config=None, **kwargs): """ Returns a constructed source object without downloading or parsing the articles. """ config = config or Configuration() # Order matters - url = url or '' # Empty string precedence over None - valid_href = ('://' in url) and (url[:4] == 'http') + config = extend_config(config, kwargs) - if not valid_href: - print 'ERR: provide a valid url' - return None + url = url or '' + s = Source(url, config=config) - s = Source(url, config) - s.build() + # dry means we are just testing, don't actually build source + if not dry: + s.build() return s -def build_article(url=u''): +def build_article(url=u'', config=None, **kwargs): """ Returns a constructed article object without downloading or parsing. 
""" - url = url or '' # empty string precedence over None - a = Article(url) + config = config or Configuration() # Order matters + config = extend_config(config, kwargs) + + url = url or '' + a = Article(url, config=config) return a def languages(): diff --git a/newspaper/article.py b/newspaper/article.py index 722ae77..7aceb3e 100644 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -18,7 +18,7 @@ from . import network from . import settings from .configuration import Configuration from .extractors import StandardContentExtractor -from .utils import URLHelper, encodeValue, RawHelper +from .utils import URLHelper, encodeValue, RawHelper, extend_config from .cleaners import StandardDocumentCleaner from .outputformatters import StandardOutputFormatter from .videos.extractors import VideoExtractor @@ -33,23 +33,13 @@ class ArticleException(Exception): class Article(object): """ """ - def extend_config(self, config_items): - """ - We are handling config value setting like this for a cleaner api. - Users just need to pass in a named param to this article and we - can dynamically set a config object for it. - """ - for key, val in config_items.items(): - if hasattr(self.config, key): - setattr(self.config, key, val) - def __init__(self, url, title=u'', source_url=u'', config=None, **kwargs): """ The **kwargs arguement can be filled with config values which we then push in. 
""" self.config = config or Configuration() - self.extend_config(kwargs) + self.config = extend_config(self.config, kwargs) self.parser = self.config.get_parser() self.extractor = StandardContentExtractor(config=self.config) diff --git a/newspaper/source.py b/newspaper/source.py index 1e89af8..37efb6b 100644 --- a/newspaper/source.py +++ b/newspaper/source.py @@ -20,7 +20,7 @@ from .extractors import StandardContentExtractor from .urls import ( get_domain, get_scheme, prepare_url) from .utils import ( - memoize_articles, cache_disk, clear_memo_cache, encodeValue) + memoize_articles, cache_disk, clear_memo_cache, encodeValue, extend_config) log = logging.getLogger(__name__) @@ -52,16 +52,6 @@ class Source(object): articles = [
<article obj>, <article obj>, ..] brand = 'cnn' """ - def extend_config(self, config_items): - """ - We are handling config value setting like this for a cleaner api. - Users just need to pass in a named param to this source and we can - dynamically generate a config object for it. - """ - for key, val in config_items.items(): - if hasattr(self.config, key): - setattr(self.config, key, val) - def __init__(self, url, config=None, **kwargs): """ **The config object for this source will be passed into all of this @@ -71,7 +61,7 @@ class Source(object): raise Exception('Input url is bad!') self.config = config or Configuration() # Order matters - self.extend_config(kwargs) + self.config = extend_config(self.config, kwargs) self.parser = self.config.get_parser() self.extractor = StandardContentExtractor(config=self.config) diff --git a/newspaper/utils/__init__.py b/newspaper/utils/__init__.py index 768919d..81519e9 100644 --- a/newspaper/utils/__init__.py +++ b/newspaper/utils/__init__.py @@ -369,4 +369,15 @@ def print_available_languages(): print ' %s\t\t\t %s' % (code, language_dict[code]) print +def extend_config(config, config_items): + """ + We are handling config value setting like this for a cleaner api. + Users just need to pass in a named param to this source and we can + dynamically generate a config object for it. 
+ """ + for key, val in config_items.items(): + if hasattr(config, key): + setattr(config, key, val) + + return config diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 674dcdd..84fea3a 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -232,7 +232,7 @@ class APITestCase(unittest.TestCase): @print_test def test_source_build(self): - huff_paper = newspaper.build('http://www.huffingtonpost.com/') + huff_paper = newspaper.build('http://www.huffingtonpost.com/', dry=True) assert isinstance(huff_paper, Source) == True @print_test @@ -346,6 +346,18 @@ class ConfigBuildTestCase(unittest.TestCase): assert s.config.language == 'en' assert s.config.use_meta_language == False + s = newspaper.build('http://cnn.com', dry=True) + assert s.config.language == 'en' + assert s.config.MAX_FILE_MEMO == 20000 + assert s.config.memoize_articles == True + assert s.config.use_meta_language == True + + s = newspaper.build('http://cnn.com', dry=True, memoize_articles=False, + MAX_FILE_MEMO=10000, language='zh') + assert s.config.language == 'zh' + assert s.config.MAX_FILE_MEMO == 10000 + assert s.config.memoize_articles == False + assert s.config.use_meta_language == False class MultiLanguageTestCase(unittest.TestCase): def runTest(self):