This commit is contained in:
Eyal Strassburg 2025-11-04 16:02:37 +01:00 committed by GitHub
commit 0c65f7b4fa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 5 additions and 0 deletions

View file

@@ -29,6 +29,9 @@ class DocumentCleaner(object):
"|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
"|legende|ajoutVideo|timestamp|js_replies"
)
# enable adding additional remove patterns through the config object
if self.config.additional_remove_nodes_re:
self.remove_nodes_re += '|' + self.config.additional_remove_nodes_re
self.regexp_namespace = "http://exslt.org/regular-expressions"
self.nauthy_ids_re = ("//*[re:test(@id, '%s', 'i')]" %
self.remove_nodes_re)

View file

@@ -73,6 +73,8 @@ class Configuration(object):
self.thread_timeout_seconds = 1
self.ignored_content_types_defaults = {}
self.additional_remove_nodes_re = None
# Set this to False if you want to recompute the categories
# *every* time you build a `Source` object
# TODO: Actually make this work