From 7c8cf2dcbb9eff210de285d8b1dfebec0526e032 Mon Sep 17 00:00:00 2001 From: Lucas Ou-Yang Date: Mon, 16 Jun 2014 01:49:35 -0700 Subject: [PATCH] refactor meta img extraction --- newspaper/article.py | 2 +- newspaper/extractors.py | 61 +++++++++++++++++++++++++++-------------- newspaper/parsers.py | 6 ++-- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/newspaper/article.py b/newspaper/article.py index f199ef7..575533a 100644 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -249,7 +249,7 @@ class Article(object): if not self.is_parsed: raise ArticleException('must parse article before checking \ if it\'s body is valid!') - meta_type = self.extractor.get_meta_type(self.clean_doc) + meta_type = self.extractor.get_meta_type(self) wordcount = self.text.split(' ') sentcount = self.text.split('.') diff --git a/newspaper/extractors.py b/newspaper/extractors.py index fd9fce2..60876b5 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -124,7 +124,7 @@ class ContentExtractor(object): VALS = ['author', 'byline'] matches = [] _authors, authors = [], [] - doc = article.doc + doc = article.clean_doc html = article.html for attr in ATTRS: @@ -173,7 +173,7 @@ class ContentExtractor(object): Fetch the article title and analyze it. """ title = '' - doc = article.doc + doc = article.clean_doc title_element = self.parser.getElementsByTag(doc, tag='title') # no title found @@ -234,7 +234,7 @@ class ContentExtractor(object): """ kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'} - meta = self.parser.getElementsByTag(article.doc, **kwargs) + meta = self.parser.getElementsByTag(article.clean_doc, **kwargs) if meta: favicon = self.parser.getAttribute(meta[0], 'href') return favicon @@ -245,7 +245,7 @@ class ContentExtractor(object): Extract content language from meta. """ # we have a lang attribute in html - attr = self.parser.getAttribute(article.doc, attr='lang') + attr = self.parser.getAttribute(article.clean_doc, attr='lang') if attr is None: # look up for a Content-Language in meta items = [ @@ -253,7 +253,7 @@ class ContentExtractor(object): {'tag': 'meta', 'attr': 'name', 'value': 'lang'} ] for item in items: - meta = self.parser.getElementsByTag(article.doc, **item) + meta = self.parser.getElementsByTag(article.clean_doc, **item) if meta: attr = self.parser.getAttribute(meta[0], attr='content') break @@ -283,27 +283,56 @@ class ContentExtractor(object): return '' + def get_meta_img_url(self, article): + """ + Returns the 'top img' as specified by the website. + """ + top_meta_image, try_one, try_two, try_three, try_four = [None] * 5 + doc = article.clean_doc + try_one = self.get_meta_content(doc, 'meta[property="og:image"]') + + if try_one is None: + link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'} + try_two = self.parser.getElementsByTag(doc, **link_icon_kwargs) + + if try_two is None: + link_img_src_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'img_src'} + try_three = self.parser.getElementsByTag(doc, **link_img_src_kwargs) + + if try_three is None: + try_four = self.get_meta_content(doc, 'meta[name="og:image"]') + + top_meta_image = try_one or try_two or try_three or try_four # :) + + return urlparse.urljoin(article.url, top_meta_image) + def get_meta_type(self, article): """ Returns meta type of article, open graph protocol. """ - return self.get_meta_content(article.doc, 'meta[property="og:type"]') + return self.get_meta_content(article.clean_doc, 'meta[property="og:type"]') - def get_meta_description(self, article): + def get_meta_description(self, article_or_source): """ If the article has meta description set in the source, use that. """ - return self.get_meta_content(article.doc, "meta[name=description]") + # Since objects use this particular method and sources don't + # have a 'clean_doc' we just use doc + try: # "easier to ask for forgiveness than permission" + doc = article_or_source.clean_doc + except: + doc = article_or_source.doc + return self.get_meta_content(doc, "meta[name=description]") def get_meta_keywords(self, article): """ If the article has meta keywords set in the source, use that. """ - return self.get_meta_content(article.doc, "meta[name=keywords]") + return self.get_meta_content(article.clean_doc, "meta[name=keywords]") def get_meta_data(self, article): data = defaultdict(dict) - props = self.parser.css_select(article.doc, 'meta') + props = self.parser.css_select(article.clean_doc, 'meta') for prop in props: key = prop.attrib.get('property') @@ -347,7 +376,7 @@ class ContentExtractor(object): If the article has meta canonical link set in the url. """ kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'} - meta = self.parser.getElementsByTag(article.doc, **kwargs) + meta = self.parser.getElementsByTag(article.clean_doc, **kwargs) if meta is not None and len(meta) > 0: href = self.parser.getAttribute(meta[0], 'href') if href: @@ -377,14 +406,6 @@ class ContentExtractor(object): return urlparse.urljoin(article.url, node_images[0]) return u'' - def get_meta_img_url(self, article): - """ - """ - # !important, we must use clean_doc because at this point doc has been cleaned - doc = article.clean_doc - meta_img_url = self.parser.get_meta_img_url(doc) - return urlparse.urljoin(article.url, meta_img_url) - def get_category_urls(self, source, source_url=None, page_urls=None): """ Requires: source lxml root and source url takes a domain and finds all of the @@ -524,7 +545,7 @@ class ContentExtractor(object): return feeds def extract_tags(self, article): - node = article.doc + node = article.clean_doc # node doesn't have chidren if len(list(node)) == 0: diff --git a/newspaper/parsers.py b/newspaper/parsers.py index 46acec3..fa0ebdc 100644 --- a/newspaper/parsers.py +++ b/newspaper/parsers.py @@ -73,12 +73,9 @@ class Parser(object): return cls.root_to_urls(doc, titles) + """ @classmethod def get_meta_img_url(cls, doc): - """ - Takes an lxml doc and returns the top img url - running as method == 'soup' assumes lxml's soupparser. - """ try: return doc.xpath('/html/head/meta[@property="og:image"][1]/@content')[0] except: @@ -96,6 +93,7 @@ class Parser(object): except: pass return None + """ @classmethod def get_img_urls(cls, doc):