diff --git a/newspaper/article.py b/newspaper/article.py
index f199ef7..575533a 100644
--- a/newspaper/article.py
+++ b/newspaper/article.py
@@ -249,7 +249,7 @@ class Article(object):
if not self.is_parsed:
raise ArticleException('must parse article before checking \
if it\'s body is valid!')
- meta_type = self.extractor.get_meta_type(self.clean_doc)
+ meta_type = self.extractor.get_meta_type(self)
wordcount = self.text.split(' ')
sentcount = self.text.split('.')
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index fd9fce2..60876b5 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -124,7 +124,7 @@ class ContentExtractor(object):
VALS = ['author', 'byline']
matches = []
_authors, authors = [], []
- doc = article.doc
+ doc = article.clean_doc
html = article.html
for attr in ATTRS:
@@ -173,7 +173,7 @@ class ContentExtractor(object):
Fetch the article title and analyze it.
"""
title = ''
- doc = article.doc
+ doc = article.clean_doc
title_element = self.parser.getElementsByTag(doc, tag='title')
# no title found
@@ -234,7 +234,7 @@ class ContentExtractor(object):
"""
kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
- meta = self.parser.getElementsByTag(article.doc, **kwargs)
+ meta = self.parser.getElementsByTag(article.clean_doc, **kwargs)
if meta:
favicon = self.parser.getAttribute(meta[0], 'href')
return favicon
@@ -245,7 +245,7 @@ class ContentExtractor(object):
Extract content language from meta.
"""
# we have a lang attribute in html
- attr = self.parser.getAttribute(article.doc, attr='lang')
+ attr = self.parser.getAttribute(article.clean_doc, attr='lang')
if attr is None:
# look up for a Content-Language in meta
items = [
@@ -253,7 +253,7 @@ class ContentExtractor(object):
{'tag': 'meta', 'attr': 'name', 'value': 'lang'}
]
for item in items:
- meta = self.parser.getElementsByTag(article.doc, **item)
+ meta = self.parser.getElementsByTag(article.clean_doc, **item)
if meta:
attr = self.parser.getAttribute(meta[0], attr='content')
break
@@ -283,27 +283,56 @@ class ContentExtractor(object):
return ''
+    def get_meta_img_url(self, article):
+        """
+        Returns the 'top img' as specified by the website, joined onto
+        article.url. Tries, in order: og:image property, <link rel="icon">,
+        <link rel="img_src">, og:image name; the first non-empty one wins.
+        """
+        doc = article.clean_doc
+        top_meta_image = self.get_meta_content(doc, 'meta[property="og:image"]')
+        # getElementsByTag gives elements, not urls, so read the first href.
+        # Truthiness (not `is None`) also treats '' and [] as "not found".
+        for rel in ('icon', 'img_src'):
+            if top_meta_image:
+                break
+            links = self.parser.getElementsByTag(
+                doc, tag='link', attr='rel', value=rel)
+            if links:
+                top_meta_image = self.parser.getAttribute(links[0], 'href')
+
+        if not top_meta_image:
+            top_meta_image = self.get_meta_content(doc, 'meta[name="og:image"]')
+
+        return urlparse.urljoin(article.url, top_meta_image)
+
def get_meta_type(self, article):
"""
Returns meta type of article, open graph protocol.
"""
- return self.get_meta_content(article.doc, 'meta[property="og:type"]')
+ return self.get_meta_content(article.clean_doc, 'meta[property="og:type"]')
- def get_meta_description(self, article):
+ def get_meta_description(self, article_or_source):
"""
If the article has meta description set in the source, use that.
"""
- return self.get_meta_content(article.doc, "meta[name=description]")
+        # This is called with both articles and sources, but sources have
+        # no 'clean_doc', so fall back to their plain 'doc'.
+        try:  # "easier to ask for forgiveness than permission"
+            doc = article_or_source.clean_doc
+        except AttributeError:
+            doc = article_or_source.doc
+        return self.get_meta_content(doc, "meta[name=description]")
def get_meta_keywords(self, article):
"""
If the article has meta keywords set in the source, use that.
"""
- return self.get_meta_content(article.doc, "meta[name=keywords]")
+ return self.get_meta_content(article.clean_doc, "meta[name=keywords]")
def get_meta_data(self, article):
data = defaultdict(dict)
- props = self.parser.css_select(article.doc, 'meta')
+ props = self.parser.css_select(article.clean_doc, 'meta')
for prop in props:
key = prop.attrib.get('property')
@@ -347,7 +376,7 @@ class ContentExtractor(object):
If the article has meta canonical link set in the url.
"""
kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'}
- meta = self.parser.getElementsByTag(article.doc, **kwargs)
+ meta = self.parser.getElementsByTag(article.clean_doc, **kwargs)
if meta is not None and len(meta) > 0:
href = self.parser.getAttribute(meta[0], 'href')
if href:
@@ -377,14 +406,6 @@ class ContentExtractor(object):
return urlparse.urljoin(article.url, node_images[0])
return u''
- def get_meta_img_url(self, article):
- """
- """
- # !important, we must use clean_doc because at this point doc has been cleaned
- doc = article.clean_doc
- meta_img_url = self.parser.get_meta_img_url(doc)
- return urlparse.urljoin(article.url, meta_img_url)
-
def get_category_urls(self, source, source_url=None, page_urls=None):
"""
Requires: source lxml root and source url takes a domain and finds all of the
@@ -524,7 +545,7 @@ class ContentExtractor(object):
return feeds
def extract_tags(self, article):
- node = article.doc
+ node = article.clean_doc
# node doesn't have chidren
if len(list(node)) == 0:
diff --git a/newspaper/parsers.py b/newspaper/parsers.py
index 46acec3..fa0ebdc 100644
--- a/newspaper/parsers.py
+++ b/newspaper/parsers.py
@@ -73,12 +73,9 @@ class Parser(object):
return cls.root_to_urls(doc, titles)
+ """
@classmethod
def get_meta_img_url(cls, doc):
- """
- Takes an lxml doc and returns the top img url
- running as method == 'soup' assumes lxml's soupparser.
- """
try:
return doc.xpath('/html/head/meta[@property="og:image"][1]/@content')[0]
except:
@@ -96,6 +93,7 @@ class Parser(object):
except:
pass
return None
+ """
@classmethod
def get_img_urls(cls, doc):