mirror of
https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
Handle file scheme in Article.download (#598)
- Update the Article download function to handle the file scheme.
- Add test cases for using Article.download with a file URL.
This commit is contained in:
parent
d1766a8b84
commit
4a540cbcd9
2 changed files with 47 additions and 5 deletions
|
|
@ -8,6 +8,7 @@ import logging
|
|||
import copy
|
||||
import os
|
||||
import glob
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
|
@ -158,6 +159,23 @@ class Article(object):
|
|||
self.parse()
|
||||
self.nlp()
|
||||
|
||||
def _parse_scheme_file(self, path):
    """Read the article HTML from a local file.

    Used by ``download`` when the article URL has the ``file`` scheme;
    ``path`` is the path component of that URL.

    Returns the file contents as a string. If the file cannot be opened,
    read or decoded, ``download_state`` is set to
    ``ArticleDownloadState.FAILED_RESPONSE``, a description of the problem
    is stored in ``download_exception_msg`` and ``None`` is returned.
    """
    try:
        with open(path, "r") as fin:
            return fin.read()
    except (OSError, UnicodeError) as e:
        # UnicodeError covers files that exist but cannot be decoded as
        # text; without it the exception would escape ``download`` instead
        # of being reported as a failed download.
        self.download_state = ArticleDownloadState.FAILED_RESPONSE
        # Prefer the short OS-level description (e.g. "No such file or
        # directory"); decoding errors, and OSErrors built without an
        # errno, have none, so fall back to the full exception text.
        self.download_exception_msg = getattr(e, "strerror", None) or str(e)
        return None
|
||||
|
||||
def _parse_scheme_http(self):
    """Fetch the article HTML for ``self.url`` over the network.

    Delegates to ``network.get_html_2XX_only`` with this article's config
    and returns whatever it returns. When the request raises a
    ``requests`` exception, the article is marked as
    ``ArticleDownloadState.FAILED_RESPONSE``, the exception text is kept in
    ``download_exception_msg`` and ``None`` is returned.
    """
    html = None
    try:
        html = network.get_html_2XX_only(self.url, self.config)
    except requests.exceptions.RequestException as err:
        # Record the failure on the article rather than propagating it;
        # the caller checks for a ``None`` result.
        self.download_state = ArticleDownloadState.FAILED_RESPONSE
        self.download_exception_msg = str(err)
    return html
|
||||
|
||||
def download(self, input_html=None, title=None, recursion_counter=0):
|
||||
"""Downloads the link's HTML content, don't use if you are batch async
|
||||
downloading articles
|
||||
|
|
@ -166,11 +184,12 @@ class Article(object):
|
|||
infinite
|
||||
"""
|
||||
if input_html is None:
|
||||
try:
|
||||
html = network.get_html_2XX_only(self.url, self.config)
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.download_state = ArticleDownloadState.FAILED_RESPONSE
|
||||
self.download_exception_msg = str(e)
|
||||
parsed_url = urlparse(self.url)
|
||||
if parsed_url.scheme == "file":
|
||||
html = self._parse_scheme_file(parsed_url.path)
|
||||
else:
|
||||
html = self._parse_scheme_http()
|
||||
if html is None:
|
||||
log.debug('Download failed on URL %s because of %s' %
|
||||
(self.url, self.download_exception_msg))
|
||||
return
|
||||
|
|
|
|||
23
tests/unit_tests.py
vendored
23
tests/unit_tests.py
vendored
|
|
@ -25,6 +25,7 @@ URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt')
|
|||
|
||||
import newspaper
|
||||
from newspaper import Article, fulltext, Source, ArticleException, news_pool
|
||||
from newspaper.article import ArticleDownloadState
|
||||
from newspaper.configuration import Configuration
|
||||
from newspaper.urls import get_domain
|
||||
|
||||
|
|
@ -177,6 +178,8 @@ class ArticleTestCase(unittest.TestCase):
|
|||
self.setup_stage('download')
|
||||
html = mock_resource_with('cnn_article', 'html')
|
||||
self.article.download(html)
|
||||
self.assertEqual(self.article.download_state, ArticleDownloadState.SUCCESS)
|
||||
self.assertEqual(self.article.download_exception_msg, None)
|
||||
self.assertEqual(75406, len(self.article.html))
|
||||
|
||||
@print_test
|
||||
|
|
@ -322,6 +325,26 @@ class ArticleTestCase(unittest.TestCase):
|
|||
self.assertCountEqual(KEYWORDS, self.article.keywords)
|
||||
|
||||
|
||||
class TestDownloadScheme(unittest.TestCase):
    """Exercise ``Article.download`` with ``file://`` URLs."""

    def _download_local(self, filename):
        """Download ``filename`` from the HTML fixture directory via a
        ``file://`` URL and return the resulting article."""
        local_path = os.path.join(HTML_FN, filename)
        article = Article(url="file://" + local_path)
        article.download()
        return article

    @print_test
    def test_download_file_success(self):
        # An existing fixture file is read in full and marked successful.
        article = self._download_local("cnn_article.html")
        self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
        self.assertEqual(article.download_exception_msg, None)
        self.assertEqual(75406, len(article.html))

    @print_test
    def test_download_file_failure(self):
        # A missing file leaves the HTML empty and records the OS error.
        article = self._download_local("does_not_exist.html")
        self.assertEqual(0, len(article.html))
        self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
        self.assertEqual(article.download_exception_msg, "No such file or directory")
|
||||
|
||||
|
||||
class ContentExtractorTestCase(unittest.TestCase):
|
||||
"""Test specific element extraction cases"""
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue