Differentiate strict and non-strict date regex filters/extractors. (#508)

* Differentiate strict and non-strict date regex filters/extractors.

* Fix unit tests for date regex
This commit is contained in:
Lucas Ou-Yang 2018-01-22 10:17:07 -08:00 committed by GitHub
parent 8e7c1707e0
commit 17f06734bb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 6 additions and 4 deletions

View file

@@ -188,7 +188,7 @@ class ContentExtractor(object):
# specifier, e.g. /2014/04/
return None
-date_match = re.search(urls.DATE_REGEX, url)
+date_match = re.search(urls.STRICT_DATE_REGEX, url)
if date_match:
date_str = date_match.group(0)
datetime_obj = parse_date_str(date_str)

View file

@@ -20,7 +20,9 @@ log = logging.getLogger(__name__)
MAX_FILE_MEMO = 20000
-DATE_REGEX = r'(?<=\W)([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?'
+_STRICT_DATE_REGEX_PREFIX = r'(?<=\W)'
+DATE_REGEX = r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?'
+STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX
ALLOWED_TYPES = ['html', 'htm', 'md', 'rst', 'aspx', 'jsp', 'rhtml', 'cgi',
'xhtml', 'jhtml', 'asp']

4
tests/unit_tests.py vendored
View file

@@ -523,7 +523,7 @@ class UrlTestCase(unittest.TestCase):
@print_test
def test_pubdate(self):
"""Checks that irrelevant data in url isn't considered as publishing date"""
-from newspaper.urls import DATE_REGEX
+from newspaper.urls import STRICT_DATE_REGEX
with open(os.path.join(TEST_DIR, 'data/test_urls_pubdate.txt'), 'r') as f:
lines = f.readlines()
@@ -533,7 +533,7 @@ class UrlTestCase(unittest.TestCase):
for pubdate, url in test_tuples:
is_present = bool(int(pubdate))
-date_match = re.search(DATE_REGEX, url)
+date_match = re.search(STRICT_DATE_REGEX, url)
try:
self.assertEqual(is_present, bool(date_match))
except AssertionError: