mirror of
https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
Merge 84f3f7380c into 648fb2a18b
This commit is contained in:
commit
1cf98edd26
2 changed files with 14 additions and 1 deletions
|
|
@ -28,6 +28,12 @@ class DocumentCleaner(object):
|
|||
"|date|^print$|popup|author-dropdown|tools|socialtools|byline"
|
||||
"|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
|
||||
"|legende|ajoutVideo|timestamp|js_replies"
|
||||
"|date|^print$|popup|author-dropdown|tools|socialtools|byline"
|
||||
"|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
|
||||
"|legende|ajoutVideo|timestamp|js_replies|breadcrumb|^rating$""|^comment$|^share$|^like$|^icon$|^count$|^sharing$|^news-list$"
|
||||
"|^vote$|^ad$|^Ad$|^rec$|^oneindia$|^inread$|^showmore$|^tags_scroll$""|^Share$|^date$|^related$|^fb-root$|^recommendation$|^recomment$"
|
||||
"|^readalso$|^read-also$|^image_counter$|^yarp$|^navig$|^extranews$""|^arrow$|^slider__footer$|^socbuttons$|^see-more$|^subscribe$"
|
||||
"|post-data|post-social|article__content__author-title|archive__posts__item""|user|^banner$|^flair$|^forlo$|append-news|^inject$|^rg-gallery_inj$|^tag$"
|
||||
)
|
||||
self.regexp_namespace = "http://exslt.org/regular-expressions"
|
||||
self.nauthy_ids_re = ("//*[re:test(@id, '%s', 'i')]" %
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ import re
|
|||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from difflib import SequenceMatcher
|
||||
from dateutil.parser import parse as date_parser
|
||||
from tldextract import tldextract
|
||||
from urllib.parse import urljoin, urlparse, urlunparse
|
||||
|
|
@ -344,11 +345,17 @@ class ContentExtractor(object):
|
|||
# (either it differs for case, for special chars, or it's truncated)
|
||||
# in these cases, we prefer the title_text_h1
|
||||
filter_title = filter_regex.sub('', title).lower()
|
||||
if filter_title_text_h1 == filter_title:
|
||||
if self.is_similar(filter_title_text_h1, filter_title):
|
||||
title = title_text_h1
|
||||
|
||||
return title
|
||||
|
||||
def is_similar(self, text_a, text_b, threshold=0.6):
    """Tell whether two strings are close enough to count as the same title.

    Used for the comparison between the final title and title_text_h1.
    The similarity score is difflib's SequenceMatcher ratio, a float in
    [0, 1] where 1.0 means the two sequences are identical.

    Args:
        text_a: first string to compare.
        text_b: second string to compare.
        threshold: score that must be strictly exceeded for the strings
            to be considered similar. The default, 0.6, is an empirical
            value.

    Returns:
        True if the similarity ratio is strictly greater than
        ``threshold``, otherwise False.
    """
    # No "junk" predicate (first argument None): every character takes
    # part in the match.
    similarity = SequenceMatcher(None, text_a, text_b).ratio()
    return similarity > threshold
|
||||
|
||||
def split_title(self, title, splitter, hint=None):
|
||||
"""Split the title to best part possible
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue