mirror of
https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
Differentiate strict and non-strict date regex filters/extractors. (#508)
* Differentiate strict and non-strict date regex filters/extractors.
* Fix unit tests for date regex
This commit is contained in:
parent
8e7c1707e0
commit
17f06734bb
3 changed files with 6 additions and 4 deletions
|
|
@@ -188,7 +188,7 @@ class ContentExtractor(object):
|
|||
# specifier, e.g. /2014/04/
|
||||
return None
|
||||
|
||||
date_match = re.search(urls.DATE_REGEX, url)
|
||||
date_match = re.search(urls.STRICT_DATE_REGEX, url)
|
||||
if date_match:
|
||||
date_str = date_match.group(0)
|
||||
datetime_obj = parse_date_str(date_str)
|
||||
|
|
|
|||
|
|
@@ -20,7 +20,9 @@ log = logging.getLogger(__name__)
|
|||
|
||||
MAX_FILE_MEMO = 20000
|
||||
|
||||
DATE_REGEX = r'(?<=\W)([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?'
|
||||
_STRICT_DATE_REGEX_PREFIX = r'(?<=\W)'
|
||||
DATE_REGEX = r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?'
|
||||
STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX
|
||||
|
||||
ALLOWED_TYPES = ['html', 'htm', 'md', 'rst', 'aspx', 'jsp', 'rhtml', 'cgi',
|
||||
'xhtml', 'jhtml', 'asp']
|
||||
|
|
|
|||
4
tests/unit_tests.py
vendored
4
tests/unit_tests.py
vendored
|
|
@@ -523,7 +523,7 @@ class UrlTestCase(unittest.TestCase):
|
|||
@print_test
|
||||
def test_pubdate(self):
|
||||
"""Checks that irrelevant data in url isn't considered as publishing date"""
|
||||
from newspaper.urls import DATE_REGEX
|
||||
from newspaper.urls import STRICT_DATE_REGEX
|
||||
|
||||
with open(os.path.join(TEST_DIR, 'data/test_urls_pubdate.txt'), 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
|
@@ -533,7 +533,7 @@ class UrlTestCase(unittest.TestCase):
|
|||
|
||||
for pubdate, url in test_tuples:
|
||||
is_present = bool(int(pubdate))
|
||||
date_match = re.search(DATE_REGEX, url)
|
||||
date_match = re.search(STRICT_DATE_REGEX, url)
|
||||
try:
|
||||
self.assertEqual(is_present, bool(date_match))
|
||||
except AssertionError:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue