diff --git a/newspaper/cleaners.py b/newspaper/cleaners.py
index 47b6f1a..3389940 100644
--- a/newspaper/cleaners.py
+++ b/newspaper/cleaners.py
@@ -28,6 +28,15 @@ class DocumentCleaner(object):
             "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
             "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
             "|legende|ajoutVideo|timestamp|js_replies"
+            # additional alternatives; entries already listed above are not repeated
+            "|breadcrumb|^rating$"
+            "|^comment$|^share$|^like$|^icon$|^count$|^sharing$|^news-list$"
+            "|^vote$|^ad$|^Ad$|^rec$|^oneindia$|^inread$|^showmore$|^tags_scroll$"
+            "|^Share$|^date$|^related$|^fb-root$|^recommendation$|^recomment$"
+            "|^readalso$|^read-also$|^image_counter$|^yarp$|^navig$|^extranews$"
+            "|^arrow$|^slider__footer$|^socbuttons$|^see-more$|^subscribe$"
+            "|post-data|post-social|article__content__author-title|archive__posts__item"
+            "|user|^banner$|^flair$|^forlo$|append-news|^inject$|^rg-gallery_inj$|^tag$"
         )
         self.regexp_namespace = "http://exslt.org/regular-expressions"
         self.nauthy_ids_re = ("//*[re:test(@id, '%s', 'i')]" %
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 9625540..6db20e9 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -17,6 +17,7 @@
 import re
 import re
 from collections import defaultdict
+from difflib import SequenceMatcher
 from dateutil.parser import parse as date_parser
 from tldextract import tldextract
 from urllib.parse import urljoin, urlparse, urlunparse
@@ -344,11 +345,21 @@ class ContentExtractor(object):
         # (either it differs for case, for special chars, or it's truncated)
         # in these cases, we prefer the title_text_h1
         filter_title = filter_regex.sub('', title).lower()
-        if filter_title_text_h1 == filter_title:
+        if self.is_similar(filter_title_text_h1, filter_title):
             title = title_text_h1
 
         return title
 
+    def is_similar(self, text_a, text_b):
+        """Return True when text_a and text_b are similar enough to be
+        treated as the same title.
+
+        Used to compare the final title with title_text_h1. Similarity is
+        difflib's SequenceMatcher ratio (a float in [0, 1]); the ratio
+        must be strictly greater than 0.6, which is an empirical value.
+        """
+        return SequenceMatcher(None, text_a, text_b).ratio() > 0.6
+
     def split_title(self, title, splitter, hint=None):
         """Split the title to best part possible
         """