felix 2017-02-24 17:53:48 +01:00
parent 331a60ebff
commit b78bb39ddc
4 changed files with 44 additions and 46 deletions

MANIFEST.in

@@ -1,4 +1,5 @@
 include newsplease/config/config.cfg
+include newsplease/config/config_lib.cfg
 include newsplease/config/sitelist.hjson
 include LICENSE.txt
 include README.md

(new file)

@@ -0,0 +1,42 @@
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+
+from newsplease.pipeline.pipelines import InMemoryStorage
+from newsplease.single_crawler import SingleCrawler
+
+
+class NewsPleaseLib:
+    """
+    Access news-please functionality via this interface
+    """
+
+    @staticmethod
+    def download_article(url):
+        """
+        Crawls the article from the url and extracts relevant information.
+        :param url:
+        :return:
+        """
+        SingleCrawler.create_as_library(url)
+        results = InMemoryStorage.get_results()
+        article = results[url]
+        del results[url]
+        return article
+
+    @staticmethod
+    def download_articles(urls):
+        """
+        Crawls articles from the urls and extracts relevant information.
+        :param urls:
+        :return:
+        """
+        SingleCrawler.create_as_library(urls)
+        results = InMemoryStorage.get_results()
+        articles = []
+        for url in urls:
+            article = results[url]
+            del results[url]
+            articles.append(article)
+            print(article['title'])
+        return articles
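
For reference, a minimal usage sketch of the interface added above. The new module's path is not visible in this view, so the import below uses a placeholder name; the calls themselves follow the code in the hunk.

# Sketch only: 'newspleaselib' is a placeholder module name,
# since the new file's actual path is not shown in this diff.
from newspleaselib import NewsPleaseLib

# Crawl a single URL and get back the extracted article.
article = NewsPleaseLib.download_article(
    'http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')
print(article['title'])

# Crawl several URLs in one run; results come back in input order.
articles = NewsPleaseLib.download_articles([
    'http://example.com/first',   # placeholder URLs
    'http://example.com/second',
])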

(deleted file)

@@ -1,45 +0,0 @@
-import sys
-import os
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-
-from newsplease.pipeline.pipelines import InMemoryStorage
-from newsplease.single_crawler import SingleCrawler
-
-
-class NewsPleaseLib:
-    """
-    Access news-please functionality via this interface
-    """
-
-    @staticmethod
-    def download_article(url):
-        """
-        Crawls the article from the url and extracts relevant information.
-        :param url:
-        :return:
-        """
-        SingleCrawler.create_as_library(url)
-        results = InMemoryStorage.get_results()
-        article = results[url]
-        del results[url]
-        return article
-
-    @staticmethod
-    def download_articles(urls):
-        """
-        Crawls articles from the urls and extracts relevant information.
-        :param urls:
-        :return:
-        """
-        SingleCrawler.create_as_library(urls)
-        results = InMemoryStorage.get_results()
-        articles = []
-        for url in urls:
-            article = results[url]
-            del results[url]
-            articles.append(article)
-            print(article['title'])
-        return articles
-
-if __name__ == '__main__':
-    NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')

setup.py

@@ -4,7 +4,7 @@ import sys, os
 setup(name='news-please',
-      version='1.0.25',
+      version='1.0.27',
       description="news-please is an open source, easy-to-use news extractor that just works.",
       long_description="""\
 news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can recursively follow internal hyperlinks and read RSS feeds to fetch both the most recent and also old, archived articles. You only need to provide the root URL of the news website.""",