mirror of
https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
added meta refresh redirect support
This commit is contained in:
parent
7622ec20f2
commit
308469838f
6 changed files with 60 additions and 1 deletions
|
|
@ -219,6 +219,8 @@ Here is a full list of the configuration options:
|
|||
|
||||
``fetch_images``, default True, "set this to false if you don't care about getting images"
|
||||
|
||||
``follow_meta_refresh``, default False, "follows a redirect url in a meta refresh html tag"
|
||||
|
||||
``image_dimension_ration``, default 16/9.0, "max ratio for height/width, we ignore if greater"
|
||||
|
||||
``language``, default 'en', "run ``newspaper.languages()`` to see available options."
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from .configuration import Configuration
|
|||
from .extractors import ContentExtractor
|
||||
from .outputformatters import OutputFormatter
|
||||
from .utils import (URLHelper, RawHelper, extend_config,
|
||||
get_available_languages)
|
||||
get_available_languages, extract_meta_refresh)
|
||||
from .videos.extractors import VideoExtractor
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
|
@ -147,6 +147,12 @@ class Article(object):
|
|||
"""
|
||||
if html is None:
|
||||
html = network.get_html(self.url, self.config)
|
||||
|
||||
if self.config.follow_meta_refresh:
|
||||
meta_refresh_url = extract_meta_refresh(html)
|
||||
if meta_refresh_url:
|
||||
return self.download(html=network.get_html(meta_refresh_url))
|
||||
|
||||
self.set_html(html)
|
||||
|
||||
if title is not None:
|
||||
|
|
|
|||
|
|
@ -46,6 +46,9 @@ class Configuration(object):
|
|||
self.fetch_images = True
|
||||
self.image_dimension_ration = 16 / 9.0
|
||||
|
||||
# Follow meta refresh redirect when downloading
|
||||
self.follow_meta_refresh = False
|
||||
|
||||
# Don't toggle this variable, done internally
|
||||
self.use_meta_language = True
|
||||
|
||||
|
|
@ -58,6 +61,7 @@ class Configuration(object):
|
|||
# English is the fallback
|
||||
self._language = 'en'
|
||||
|
||||
|
||||
# Unique stopword classes for oriental languages, don't toggle
|
||||
self.stopwords_class = StopWords
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ import time
|
|||
|
||||
from hashlib import sha1
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from . import settings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
|
@ -179,6 +181,21 @@ def is_ascii(word):
|
|||
return True
|
||||
|
||||
|
||||
def extract_meta_refresh(html):
    """Return the redirect URL of a meta refresh tag in `html`, if any.

    Looks for a tag like:
    <meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />
    Example can be found at: https://www.google.com/url?rct=j&sa=t&url=http://sfbay.craigslist.org/eby/cto/5617800926.html&ct=ga&cd=CAAYATIaYTc4ZTgzYjAwOTAwY2M4Yjpjb206ZW46VVM&usg=AFQjCNF7zAl6JPuEsV4PbEzBomJTUpX4Lg

    The `http-equiv` value and the `url` keyword are matched
    case-insensitively, whitespace around `;` and `=` is tolerated, and
    one layer of surrounding single or double quotes is removed from
    the URL.

    Returns the URL as a string, or None when `html` is empty, holds no
    meta refresh tag, or the tag carries only a delay (e.g.
    content="30") and therefore no redirect target.
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all('meta'):
        # Compare in Python rather than via an attrs filter so that
        # "Refresh" / "REFRESH" are recognised as well.
        if (element.get('http-equiv') or '').strip().lower() != 'refresh':
            continue

        content = element.get('content') or ''
        # Cut at the FIRST ';' only: the URL itself may legitimately
        # contain semicolons (e.g. ";jsessionid=...").
        _, _, target = content.partition(';')
        keyword, equals, url = target.partition('=')
        if not equals or keyword.strip().lower() != 'url':
            # Delay-only refresh or malformed content; try the next tag.
            continue

        # Strip only the quotes wrapping the URL, not ones inside it.
        url = url.strip().strip('\'"').strip()
        if url:
            return url
    return None
|
||||
|
||||
|
||||
def to_valid_filename(s):
|
||||
"""Converts arbitrary string (for us domain name)
|
||||
into a valid file name for caching
|
||||
|
|
|
|||
19
tests/data/html/google_meta_refresh.html
Normal file
19
tests/data/html/google_meta_refresh.html
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
<script>window.googleJavaScriptRedirect = 1</script>
|
||||
<script>var n = {
|
||||
navigateTo: function (b, a, d) {
|
||||
if (b != a && b.google) {
|
||||
if (b.google.r) {
|
||||
b.google.r = 0;
|
||||
b.location.href = d;
|
||||
a.location.replace("about:blank");
|
||||
}
|
||||
} else {
|
||||
a.location.replace(d);
|
||||
}
|
||||
}
|
||||
};
|
||||
n.navigateTo(window.parent, window, "http://sfbay.craigslist.org/eby/cto/5617800926.html");
|
||||
</script>
|
||||
<noscript>
|
||||
<META http-equiv="refresh" content="0;URL='http://example.com'">
|
||||
</noscript>
|
||||
11
tests/unit_tests.py
vendored
11
tests/unit_tests.py
vendored
|
|
@ -178,6 +178,17 @@ class ArticleTestCase(unittest.TestCase):
|
|||
self.article.download(html)
|
||||
self.assertEqual(75406, len(self.article.html))
|
||||
|
||||
@print_test
def test_meta_refresh_redirect(self):
    """An article configured with `follow_meta_refresh` should end up
    with the page named by the fixture's meta refresh tag
    (http://example.com) rather than the redirect stub itself.

    NOTE(review): following the redirect appears to fetch the target
    over the network - confirm before running offline.
    """
    cfg = Configuration()
    cfg.follow_meta_refresh = True
    redirected = Article('', config=cfg)

    stub_page = mock_resource_with('google_meta_refresh', 'html')
    redirected.download(html=stub_page)
    redirected.parse()

    # The title asserted here is the one expected from the redirect
    # target, not from the stub fixture.
    self.assertEqual(redirected.title, 'Example Domain')
|
||||
|
||||
@print_test
|
||||
def test_pre_download_parse(self):
|
||||
"""Calling `parse()` before `download()` should yield an error
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue