Handle file scheme in Article.download (#598)

- Update the Article download function to handle the file scheme.
- Add test cases for using Article.download with a file URL
This commit is contained in:
Dan Robertson 2018-10-04 00:11:50 -05:00 committed by Lucas Ou-Yang
parent d1766a8b84
commit 4a540cbcd9
2 changed files with 47 additions and 5 deletions

View file

@ -8,6 +8,7 @@ import logging
import copy
import os
import glob
from urllib.parse import urlparse
import requests
@ -158,6 +159,23 @@ class Article(object):
self.parse()
self.nlp()
def _parse_scheme_file(self, path):
    """Read the article HTML from a local file (``file`` URL scheme).

    :param path: filesystem path of the file to read, opened in text mode.
    :return: the full contents of the file as a string, or ``None`` when
        the file could not be read. On failure ``download_state`` is set
        to ``ArticleDownloadState.FAILED_RESPONSE`` and
        ``download_exception_msg`` holds the reason.
    """
    try:
        with open(path, "r") as fin:
            return fin.read()
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError; without
        # catching it an undecodable file would raise out of the caller
        # instead of being recorded as a failed download.
        self.download_state = ArticleDownloadState.FAILED_RESPONSE
        # ``strerror`` is None for an OSError created without an errno and
        # does not exist on UnicodeDecodeError, so fall back to str(e) to
        # never leave the failure message empty.
        self.download_exception_msg = getattr(e, "strerror", None) or str(e)
        return None
def _parse_scheme_http(self):
    """Fetch ``self.url`` through ``network.get_html_2XX_only``.

    :return: whatever ``network.get_html_2XX_only`` returns for
        ``self.url`` and ``self.config``, or ``None`` when the call raises
        ``requests.exceptions.RequestException``. In that case
        ``download_state`` is set to
        ``ArticleDownloadState.FAILED_RESPONSE`` and
        ``download_exception_msg`` to the exception text.
    """
    try:
        fetched = network.get_html_2XX_only(self.url, self.config)
    except requests.exceptions.RequestException as request_error:
        # Record the failure on the article rather than propagating it.
        self.download_state = ArticleDownloadState.FAILED_RESPONSE
        self.download_exception_msg = str(request_error)
        fetched = None
    return fetched
def download(self, input_html=None, title=None, recursion_counter=0):
"""Downloads the link's HTML content, don't use if you are batch async
downloading articles
@ -166,11 +184,12 @@ class Article(object):
infinite
"""
if input_html is None:
try:
html = network.get_html_2XX_only(self.url, self.config)
except requests.exceptions.RequestException as e:
self.download_state = ArticleDownloadState.FAILED_RESPONSE
self.download_exception_msg = str(e)
parsed_url = urlparse(self.url)
if parsed_url.scheme == "file":
html = self._parse_scheme_file(parsed_url.path)
else:
html = self._parse_scheme_http()
if html is None:
log.debug('Download failed on URL %s because of %s' %
(self.url, self.download_exception_msg))
return

23
tests/unit_tests.py vendored
View file

@ -25,6 +25,7 @@ URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt')
import newspaper
from newspaper import Article, fulltext, Source, ArticleException, news_pool
from newspaper.article import ArticleDownloadState
from newspaper.configuration import Configuration
from newspaper.urls import get_domain
@ -177,6 +178,8 @@ class ArticleTestCase(unittest.TestCase):
self.setup_stage('download')
html = mock_resource_with('cnn_article', 'html')
self.article.download(html)
self.assertEqual(self.article.download_state, ArticleDownloadState.SUCCESS)
self.assertEqual(self.article.download_exception_msg, None)
self.assertEqual(75406, len(self.article.html))
@print_test
@ -322,6 +325,26 @@ class ArticleTestCase(unittest.TestCase):
self.assertCountEqual(KEYWORDS, self.article.keywords)
class TestDownloadScheme(unittest.TestCase):
    """Tests for Article.download() when the article URL uses ``file://``."""

    @print_test
    def test_download_file_success(self):
        """An existing local file is downloaded with no error recorded."""
        url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
        article = Article(url=url)
        article.download()
        self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
        self.assertEqual(article.download_exception_msg, None)
        self.assertEqual(75406, len(article.html))

    @print_test
    def test_download_file_failure(self):
        """A missing local file leaves the article in FAILED_RESPONSE."""
        # Imported locally so this test does not depend on the module's
        # top-of-file import block.
        import errno

        url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
        article = Article(url=url)
        article.download()
        self.assertEqual(0, len(article.html))
        self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
        # Ask the OS for the ENOENT message instead of hard-coding its text,
        # which varies with platform and locale.
        self.assertEqual(article.download_exception_msg,
                         os.strerror(errno.ENOENT))
class ContentExtractorTestCase(unittest.TestCase):
"""Test specific element extraction cases"""