added meta refresh redirect support

This commit is contained in:
Logan Head 2016-06-10 15:03:15 -04:00
parent 7622ec20f2
commit 308469838f
6 changed files with 60 additions and 1 deletions

View file

@ -219,6 +219,8 @@ Here is a full list of the configuration options:
``fetch_images``, default True, "set this to false if you don't care about getting images"
``follow_meta_refresh``, default False, "set this to true to follow the redirect url in a meta refresh html tag when downloading"
``image_dimension_ration``, default 16/9.0, "max ratio for height/width, we ignore if greater"
``language``, default 'en', "run ``newspaper.languages()`` to see available options."

View file

@ -20,7 +20,7 @@ from .configuration import Configuration
from .extractors import ContentExtractor
from .outputformatters import OutputFormatter
from .utils import (URLHelper, RawHelper, extend_config,
get_available_languages)
get_available_languages, extract_meta_refresh)
from .videos.extractors import VideoExtractor
log = logging.getLogger(__name__)
@ -147,6 +147,12 @@ class Article(object):
"""
if html is None:
html = network.get_html(self.url, self.config)
if self.config.follow_meta_refresh:
meta_refresh_url = extract_meta_refresh(html)
if meta_refresh_url:
return self.download(html=network.get_html(meta_refresh_url))
self.set_html(html)
if title is not None:

View file

@ -46,6 +46,9 @@ class Configuration(object):
self.fetch_images = True
self.image_dimension_ration = 16 / 9.0
# Follow meta refresh redirect when downloading
self.follow_meta_refresh = False
# Don't toggle this variable, done internally
self.use_meta_language = True
@ -58,6 +61,7 @@ class Configuration(object):
# English is the fallback
self._language = 'en'
# Unique stopword classes for oriental languages, don't toggle
self.stopwords_class = StopWords

View file

@ -22,6 +22,8 @@ import time
from hashlib import sha1
from bs4 import BeautifulSoup
from . import settings
log = logging.getLogger(__name__)
@ -179,6 +181,21 @@ def is_ascii(word):
return True
def extract_meta_refresh(html):
    """Return the redirect URL declared by a meta refresh tag, if any.

    Parses html for a tag like:
        <meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />
    Example can be found at: https://www.google.com/url?rct=j&sa=t&url=http://sfbay.craigslist.org/eby/cto/
    5617800926.html&ct=ga&cd=CAAYATIaYTc4ZTgzYjAwOTAwY2M4Yjpjb206ZW46VVM&usg=AFQjCNF7zAl6JPuEsV4PbEzBomJTUpX4Lg

    :param html: markup of the page to inspect
    :return: the redirect target with any wrapping quotes removed, or
        None when the page has no meta refresh tag, the tag has no
        ``content`` attribute, or the content carries no ``url=`` part
        (for example a plain ``content="5"`` reload)
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    # The http-equiv value is matched case-insensitively so that
    # "Refresh" / "REFRESH" are recognised as well as "refresh".
    element = soup.find(
        'meta',
        attrs={'http-equiv': lambda value: bool(value)
               and value.strip().lower() == 'refresh'})
    if element is None:
        return None

    content = element.get('content')
    if not content:
        return None

    # partition() instead of split(): a content value without ';' or a
    # URL that itself contains ';' must not raise on unpacking.
    _delay, separator, target = content.partition(';')
    if not separator:
        return None

    # Tolerate whitespace around the "url" keyword and the "=" sign,
    # e.g. content="0; url = http://example.com".
    target = target.strip()
    if not target.lower().startswith('url'):
        return None
    target = target[3:].lstrip()
    if not target.startswith('='):
        return None
    url = target[1:].strip()

    # Drop only the quotes wrapping the URL; apostrophes that are part
    # of the URL itself are preserved.
    if url[:1] in ('"', "'"):
        quote = url[0]
        url = url[1:]
        closing = url.find(quote)
        if closing != -1:
            url = url[:closing]

    return url or None
def to_valid_filename(s):
"""Converts arbitrary string (for us domain name)
into a valid file name for caching

View file

@ -0,0 +1,19 @@
<script>window.googleJavaScriptRedirect = 1</script>
<script>var n = {
navigateTo: function (b, a, d) {
if (b != a && b.google) {
if (b.google.r) {
b.google.r = 0;
b.location.href = d;
a.location.replace("about:blank");
}
} else {
a.location.replace(d);
}
}
};
n.navigateTo(window.parent, window, "http://sfbay.craigslist.org/eby/cto/5617800926.html");
</script>
<noscript>
<META http-equiv="refresh" content="0;URL='http://example.com'">
</noscript>

11
tests/unit_tests.py vendored
View file

@ -178,6 +178,17 @@ class ArticleTestCase(unittest.TestCase):
self.article.download(html)
self.assertEqual(75406, len(self.article.html))
@print_test
def test_meta_refresh_redirect(self):
    """With ``follow_meta_refresh`` enabled, downloading the
    ``google_meta_refresh`` fixture and parsing the article should
    yield the title 'Example Domain'.
    """
    redirect_config = Configuration()
    redirect_config.follow_meta_refresh = True
    redirected = Article('', config=redirect_config)
    fixture_html = mock_resource_with('google_meta_refresh', 'html')
    redirected.download(html=fixture_html)
    redirected.parse()
    self.assertEqual(redirected.title, 'Example Domain')
@print_test
def test_pre_download_parse(self):
"""Calling `parse()` before `download()` should yield an error