added meta refresh redirect support

2025-12-23 05:36:50 +00:00 · 2016-06-10 15:03:15 -04:00 · 2016-06-10 15:03:15 -04:00 · 308469838f
commit 308469838f
parent 7622ec20f2
6 changed files with 60 additions and 1 deletions
--- a/docs/user_guide/advanced.rst
+++ b/docs/user_guide/advanced.rst
@ -219,6 +219,8 @@ Here is a full list of the configuration options:

 ``fetch_images``, default True, "set this to false if you don't care about getting images"

+``follow_meta_refresh``, default False, "follows a redirect url in a meta refresh html tag"
+
 ``image_dimension_ration``, default 16/9.0, "max ratio for height/width, we ignore if greater"

 ``language``, default 'en', "run ``newspaper.languages()`` to see available options."
--- a/newspaper/article.py
+++ b/newspaper/article.py
@ -20,7 +20,7 @@ from .configuration import Configuration
 from .extractors import ContentExtractor
 from .outputformatters import OutputFormatter
 from .utils import (URLHelper, RawHelper, extend_config,
-                    get_available_languages)
+                    get_available_languages, extract_meta_refresh)
 from .videos.extractors import VideoExtractor

 log = logging.getLogger(__name__)
@ -147,6 +147,12 @@ class Article(object):
        """
        if html is None:
            html = network.get_html(self.url, self.config)
+
+        if self.config.follow_meta_refresh:
+            meta_refresh_url = extract_meta_refresh(html)
+            if meta_refresh_url:
+                return self.download(html=network.get_html(meta_refresh_url))
+
        self.set_html(html)

        if title is not None:
--- a/newspaper/configuration.py
+++ b/newspaper/configuration.py
@ -46,6 +46,9 @@ class Configuration(object):
        self.fetch_images = True
        self.image_dimension_ration = 16 / 9.0

+        # Follow meta refresh redirect when downloading
+        self.follow_meta_refresh = False
+
        # Don't toggle this variable, done internally
        self.use_meta_language = True

@ -58,6 +61,7 @@ class Configuration(object):
        # English is the fallback
        self._language = 'en'

+
        # Unique stopword classes for oriental languages, don't toggle
        self.stopwords_class = StopWords

--- a/newspaper/utils.py
+++ b/newspaper/utils.py
@ -22,6 +22,8 @@ import time

 from hashlib import sha1

+from bs4 import BeautifulSoup
+
 from . import settings

 log = logging.getLogger(__name__)
@ -179,6 +181,21 @@ def is_ascii(word):
    return True


+def extract_meta_refresh(html):
+    """Return the redirect target of a meta refresh tag, or None.
+
+    Parses html for a tag like:
+    <meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />
+    Example can be found at: https://www.google.com/url?rct=j&sa=t&url=http://sfbay.craigslist.org/eby/cto/
+    5617800926.html&ct=ga&cd=CAAYATIaYTc4ZTgzYjAwOTAwY2M4Yjpjb206ZW46VVM&usg=AFQjCNF7zAl6JPuEsV4PbEzBomJTUpX4Lg
+
+    Returns None when there is no such tag, when the tag has no ``content``
+    attribute, when ``content`` is only a delay (e.g. ``content="30"``), or
+    when the part after the delay is not a non-empty ``url=...``.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+    # The http-equiv keyword is case-insensitive in HTML ("Refresh" is
+    # common in the wild), so compare it lower-cased rather than verbatim.
+    # The filter is also called with None for tags lacking the attribute.
+    element = soup.find('meta', attrs={
+        'http-equiv': lambda value: bool(value) and value.strip().lower() == 'refresh'})
+    if element is None:
+        return None
+    # Split on the first ";" only: partition() never raises, whereas
+    # unpacking split(";") fails on a bare delay or a URL containing ";".
+    _, _, target = element.get('content', '').partition(';')
+    target = target.strip()
+    if not target.lower().startswith('url='):
+        return None
+    # Drop only the optional quotes wrapped around the URL itself, so an
+    # apostrophe inside the URL is preserved.
+    return target[4:].strip().strip('\'"') or None
+
+
 def to_valid_filename(s):
    """Converts arbitrary string (for us domain name)
    into a valid file name for caching
--- a/tests/data/html/google_meta_refresh.html
+++ b/tests/data/html/google_meta_refresh.html
@ -0,0 +1,19 @@
+<script>window.googleJavaScriptRedirect = 1</script>
+<script>var n = {
+    navigateTo: function (b, a, d) {
+        if (b != a && b.google) {
+            if (b.google.r) {
+                b.google.r = 0;
+                b.location.href = d;
+                a.location.replace("about:blank");
+            }
+        } else {
+            a.location.replace(d);
+        }
+    }
+};
+n.navigateTo(window.parent, window, "http://sfbay.craigslist.org/eby/cto/5617800926.html");
+</script>
+<noscript>
+    <META http-equiv="refresh" content="0;URL='http://example.com'">
+</noscript>
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@ -178,6 +178,17 @@ class ArticleTestCase(unittest.TestCase):
        self.article.download(html)
        self.assertEqual(75406, len(self.article.html))

+    @print_test
+    def test_meta_refresh_redirect(self):
+        """With `follow_meta_refresh` enabled, downloading the Google
+        redirect fixture should end up parsing the page that its meta
+        refresh tag points at, not the redirect stub itself.
+
+        NOTE(review): the fixture's refresh target is http://example.com and
+        the expected title is that page's, so this test appears to need live
+        network access -- confirm before running it offline.
+        """
+        config = Configuration()
+        config.follow_meta_refresh = True
+        article = Article(
+            '', config=config)
+        html = mock_resource_with('google_meta_refresh', 'html')
+        article.download(html=html)
+        article.parse()
+        self.assertEqual(article.title, 'Example Domain')
+
    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error