mirror of
https://github.com/fhamborg/news-please.git
synced 2025-08-04 17:08:01 +00:00
fix bug
This commit is contained in:
parent
331a60ebff
commit
b78bb39ddc
4 changed files with 44 additions and 46 deletions
|
@ -1,4 +1,5 @@
|
|||
include newsplease/config/config.cfg
|
||||
include newsplease/config/config_lib.cfg
|
||||
include newsplease/config/sitelist.hjson
|
||||
include LICENSE.txt
|
||||
include README.md
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
import sys
|
||||
import os
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
|
||||
from newsplease.pipeline.pipelines import InMemoryStorage
|
||||
from newsplease.single_crawler import SingleCrawler
|
||||
|
||||
|
||||
class NewsPleaseLib:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def download_article(url):
        """
        Crawls the article from the url and extracts relevant information.

        :param url: URL of the article to crawl
        :return: the extracted article for that URL
        """
        SingleCrawler.create_as_library(url)
        results = InMemoryStorage.get_results()
        # pop() both retrieves and removes the entry so the shared
        # in-memory storage does not grow across repeated calls.
        return results.pop(url)

    @staticmethod
    def download_articles(urls):
        """
        Crawls articles from the urls and extracts relevant information.

        :param urls: iterable of article URLs to crawl
        :return: list of extracted articles, in the same order as urls
        """
        SingleCrawler.create_as_library(urls)
        results = InMemoryStorage.get_results()
        articles = []
        for url in urls:
            # Removed stray debug print(article['title']): a library call
            # should not write to stdout, and it crashed with KeyError
            # whenever an extracted article had no 'title' entry.
            articles.append(results.pop(url))
        return articles
|
|
@ -1,45 +0,0 @@
|
|||
import sys
|
||||
import os
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
|
||||
from newsplease.pipeline.pipelines import InMemoryStorage
|
||||
from newsplease.single_crawler import SingleCrawler
|
||||
|
||||
|
||||
class NewsPleaseLib:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def download_article(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return:
        """
        SingleCrawler.create_as_library(url)
        storage = InMemoryStorage.get_results()
        # Fetch and remove the entry in one step.
        return storage.pop(url)

    @staticmethod
    def download_articles(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return:
        """
        SingleCrawler.create_as_library(urls)
        storage = InMemoryStorage.get_results()
        collected = []
        for link in urls:
            item = storage.pop(link)
            collected.append(item)
            print(item['title'])
        return collected
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc smoke test: crawl one known article URL.
    NewsPleaseLib.download_article(
        'http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')
|
2
setup.py
2
setup.py
|
@ -4,7 +4,7 @@ import sys, os
|
|||
|
||||
|
||||
setup(name='news-please',
|
||||
version='1.0.25',
|
||||
version='1.0.27',
|
||||
description="news-please is an open source easy-to-use news extractor that just works.",
|
||||
long_description="""\
|
||||
news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can follow recursively internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website.""",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue