Include meta image URL in the images collection

This commit is contained in:
Oleg Temnov 2014-02-03 11:38:51 +04:00
parent 1f134ac632
commit e278edf704
3 changed files with 16 additions and 7 deletions

View file

@ -61,6 +61,9 @@ class Article(object):
# the url of the "best image" to represent this article, via reddit algorithm
self.top_img = self.top_image = u''
# stores image provided by metadata
self.meta_img = u''
self.imgs = self.images = [] # all image urls
self.movies = [] # youtube, vimeo, etc
@ -215,11 +218,11 @@ class Article(object):
def fetch_images(self):
if self.raw_doc is not None:
img_url = self.extractor.get_top_img_url(self)
self.set_top_img(img_url)
meta_img_url = self.extractor.get_meta_img_url(self)
self.set_meta_img(meta_img_url)
top_imgs = self.extractor.get_img_urls(self)
self.set_imgs(top_imgs)
imgs = self.extractor.get_img_urls(self)
self.set_imgs(imgs)
if self.clean_top_node is not None and not self.has_top_image():
first_img = self.extractor.get_first_img_url(self.clean_top_node)
@ -414,6 +417,10 @@ class Article(object):
"""
if article_html:
self.article_html = encodeValue(article_html)
# Record the image URL supplied by the page metadata and also forward it to
# set_top_img.
def set_meta_img(self, src_url):
# Keep the encodeValue()-processed form of the URL on the article.
self.meta_img = encodeValue(src_url)
# The same src_url (as received, not the encoded copy) is passed on to set_top_img.
self.set_top_img(src_url)
def set_top_img(self, src_url):
if src_url is not None:

View file

@ -357,6 +357,8 @@ class ContentExtractor(object):
doc = article.raw_doc
urls = self.parser.get_img_urls(doc)
img_links = set([ urlparse.urljoin(article.url, url) for url in urls ])
if article.meta_img:
img_links.add(article.meta_img)
return img_links
def get_first_img_url(self, node):
@ -365,12 +367,12 @@ class ContentExtractor(object):
return node_images[0]
return u''
def get_top_img_url(self, article):
def get_meta_img_url(self, article):
"""
"""
# !important, we must use raw_doc because at this point doc has been cleaned
doc = article.raw_doc
return self.parser.get_top_img_url(doc)
return self.parser.get_meta_img_url(doc)
def get_category_urls(self, source, source_url=None, page_urls=None):
"""

View file

@ -74,7 +74,7 @@ class Parser(object):
return cls.root_to_urls(doc, titles)
@classmethod
def get_top_img_url(cls, doc):
def get_meta_img_url(cls, doc):
"""
Takes an lxml doc and returns the top img url
running as method == 'soup' assumes lxml's soupparser.