Handle file scheme in Article.download (#598)

- Update the Article download function to handle the file scheme.
- Add test cases for using Article.download with a file URL.
2025-12-23 05:36:50 +00:00 · 2018-10-04 00:11:50 -05:00 · 2018-10-04 00:11:50 -05:00 · 4a540cbcd9
commit 4a540cbcd9
parent d1766a8b84
2 changed files with 47 additions and 5 deletions
--- a/newspaper/article.py
+++ b/newspaper/article.py
@ -8,6 +8,7 @@ import logging
 import copy
 import os
 import glob
+from urllib.parse import urlparse

 import requests

@ -158,6 +159,23 @@ class Article(object):
        self.parse()
        self.nlp()

+    def _parse_scheme_file(self, path):
+        try:
+            with open(path, "r") as fin:
+                return fin.read()
+        except OSError as e:
+            self.download_state = ArticleDownloadState.FAILED_RESPONSE
+            self.download_exception_msg = e.strerror
+            return None
+
+    def _parse_scheme_http(self):
+        try:
+            return network.get_html_2XX_only(self.url, self.config)
+        except requests.exceptions.RequestException as e:
+            self.download_state = ArticleDownloadState.FAILED_RESPONSE
+            self.download_exception_msg = str(e)
+            return None
+
    def download(self, input_html=None, title=None, recursion_counter=0):
        """Downloads the link's HTML content, don't use if you are batch async
        downloading articles
@ -166,11 +184,12 @@ class Article(object):
        infinite
        """
        if input_html is None:
-            try:
-                html = network.get_html_2XX_only(self.url, self.config)
-            except requests.exceptions.RequestException as e:
-                self.download_state = ArticleDownloadState.FAILED_RESPONSE
-                self.download_exception_msg = str(e)
+            parsed_url = urlparse(self.url)
+            if parsed_url.scheme == "file":
+                html = self._parse_scheme_file(parsed_url.path)
+            else:
+                html = self._parse_scheme_http()
+            if html is None:
                log.debug('Download failed on URL %s because of %s' %
                          (self.url, self.download_exception_msg))
                return
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@ -25,6 +25,7 @@ URLS_FILE = os.path.join(TEST_DIR, 'data', 'fulltext_url_list.txt')

 import newspaper
 from newspaper import Article, fulltext, Source, ArticleException, news_pool
+from newspaper.article import ArticleDownloadState
 from newspaper.configuration import Configuration
 from newspaper.urls import get_domain

@ -177,6 +178,8 @@ class ArticleTestCase(unittest.TestCase):
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
+        self.assertEqual(self.article.download_state, ArticleDownloadState.SUCCESS)
+        self.assertEqual(self.article.download_exception_msg, None)
        self.assertEqual(75406, len(self.article.html))

    @print_test
@ -322,6 +325,26 @@ class ArticleTestCase(unittest.TestCase):
        self.assertCountEqual(KEYWORDS, self.article.keywords)


+class TestDownloadScheme(unittest.TestCase):
+    @print_test
+    def test_download_file_success(self):
+        url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
+        article = Article(url=url)
+        article.download()
+        self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
+        self.assertEqual(article.download_exception_msg, None)
+        self.assertEqual(75406, len(article.html))
+
+    @print_test
+    def test_download_file_failure(self):
+        url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
+        article = Article(url=url)
+        article.download()
+        self.assertEqual(0, len(article.html))
+        self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
+        self.assertEqual(article.download_exception_msg, "No such file or directory")
+
+
 class ContentExtractorTestCase(unittest.TestCase):
    """Test specific element extraction cases"""