mirror of
https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
Include meta image URL in images collection
This commit is contained in:
parent
1f134ac632
commit
e278edf704
3 changed files with 16 additions and 7 deletions
|
|
@ -61,6 +61,9 @@ class Article(object):
|
|||
# the url of the "best image" to represent this article, via reddit algorithm
|
||||
self.top_img = self.top_image = u''
|
||||
|
||||
# stores image provided by metadata
|
||||
self.meta_img = u''
|
||||
|
||||
self.imgs = self.images = [] # all image urls
|
||||
self.movies = [] # youtube, vimeo, etc
|
||||
|
||||
|
|
@ -215,11 +218,11 @@ class Article(object):
|
|||
|
||||
def fetch_images(self):
|
||||
if self.raw_doc is not None:
|
||||
img_url = self.extractor.get_top_img_url(self)
|
||||
self.set_top_img(img_url)
|
||||
meta_img_url = self.extractor.get_meta_img_url(self)
|
||||
self.set_meta_img(meta_img_url)
|
||||
|
||||
top_imgs = self.extractor.get_img_urls(self)
|
||||
self.set_imgs(top_imgs)
|
||||
imgs = self.extractor.get_img_urls(self)
|
||||
self.set_imgs(imgs)
|
||||
|
||||
if self.clean_top_node is not None and not self.has_top_image():
|
||||
first_img = self.extractor.get_first_img_url(self.clean_top_node)
|
||||
|
|
@ -414,6 +417,10 @@ class Article(object):
|
|||
"""
|
||||
if article_html:
|
||||
self.article_html = encodeValue(article_html)
|
||||
|
||||
def set_meta_img(self, src_url):
|
||||
self.meta_img = encodeValue(src_url)
|
||||
self.set_top_img(src_url)
|
||||
|
||||
def set_top_img(self, src_url):
|
||||
if src_url is not None:
|
||||
|
|
|
|||
|
|
@ -357,6 +357,8 @@ class ContentExtractor(object):
|
|||
doc = article.raw_doc
|
||||
urls = self.parser.get_img_urls(doc)
|
||||
img_links = set([ urlparse.urljoin(article.url, url) for url in urls ])
|
||||
if article.meta_img:
|
||||
img_links.add(article.meta_img)
|
||||
return img_links
|
||||
|
||||
def get_first_img_url(self, node):
|
||||
|
|
@ -365,12 +367,12 @@ class ContentExtractor(object):
|
|||
return node_images[0]
|
||||
return u''
|
||||
|
||||
def get_top_img_url(self, article):
|
||||
def get_meta_img_url(self, article):
|
||||
"""
|
||||
"""
|
||||
# !important, we must use raw_doc because at this point doc has been cleaned
|
||||
doc = article.raw_doc
|
||||
return self.parser.get_top_img_url(doc)
|
||||
return self.parser.get_meta_img_url(doc)
|
||||
|
||||
def get_category_urls(self, source, source_url=None, page_urls=None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ class Parser(object):
|
|||
return cls.root_to_urls(doc, titles)
|
||||
|
||||
@classmethod
|
||||
def get_top_img_url(cls, doc):
|
||||
def get_meta_img_url(cls, doc):
|
||||
"""
|
||||
Takes an lxml doc and returns the top img url
|
||||
running as method == 'soup' assumes lxml's soupparser.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue