mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
gh-53203: Fix strptime() for %c, %x and %X formats on some locales (#135971)
* Add detection of decimal non-ASCII alt digits.
* Add support for non-decimal alt digits on locale lzh_TW.
* Accept only numbers in the correct range if alt digits are known.
* Fix a bug in detecting the position of the weekday name on locales byn_ER and wal_ET.
* Fix support for single-digit hours on locales ar_SA and bg_BG.
* Add support for %T, %R, %r, %C, %OC.
* Prepare the code to use nl_langinfo().
This commit is contained in:
parent
0c6c09b737
commit
07183ebce3
3 changed files with 164 additions and 56 deletions
190
Lib/_strptime.py
190
Lib/_strptime.py
|
@ -14,6 +14,7 @@ import os
|
|||
import time
|
||||
import locale
|
||||
import calendar
|
||||
import re
|
||||
from re import compile as re_compile
|
||||
from re import sub as re_sub
|
||||
from re import IGNORECASE
|
||||
|
@ -41,6 +42,21 @@ def _findall(haystack, needle):
|
|||
yield i
|
||||
i += len(needle)
|
||||
|
||||
|
||||
lzh_TW_alt_digits = (
|
||||
# 〇:一:二:三:四:五:六:七:八:九
|
||||
'\u3007', '\u4e00', '\u4e8c', '\u4e09', '\u56db',
|
||||
'\u4e94', '\u516d', '\u4e03', '\u516b', '\u4e5d',
|
||||
# 十:十一:十二:十三:十四:十五:十六:十七:十八:十九
|
||||
'\u5341', '\u5341\u4e00', '\u5341\u4e8c', '\u5341\u4e09', '\u5341\u56db',
|
||||
'\u5341\u4e94', '\u5341\u516d', '\u5341\u4e03', '\u5341\u516b', '\u5341\u4e5d',
|
||||
# 廿:廿一:廿二:廿三:廿四:廿五:廿六:廿七:廿八:廿九
|
||||
'\u5eff', '\u5eff\u4e00', '\u5eff\u4e8c', '\u5eff\u4e09', '\u5eff\u56db',
|
||||
'\u5eff\u4e94', '\u5eff\u516d', '\u5eff\u4e03', '\u5eff\u516b', '\u5eff\u4e5d',
|
||||
# 卅:卅一
|
||||
'\u5345', '\u5345\u4e00')
|
||||
|
||||
|
||||
class LocaleTime(object):
|
||||
"""Stores and handles locale-specific information related to time.
|
||||
|
||||
|
@ -84,6 +100,7 @@ class LocaleTime(object):
|
|||
self.__calc_weekday()
|
||||
self.__calc_month()
|
||||
self.__calc_am_pm()
|
||||
self.__calc_alt_digits()
|
||||
self.__calc_timezone()
|
||||
self.__calc_date_time()
|
||||
if _getlang() != self.lang:
|
||||
|
@ -119,9 +136,43 @@ class LocaleTime(object):
|
|||
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
|
||||
self.am_pm = am_pm
|
||||
|
||||
def __calc_alt_digits(self):
|
||||
# Set self.LC_alt_digits by using time.strftime().
|
||||
|
||||
# The magic data should contain all decimal digits.
|
||||
time_tuple = time.struct_time((1998, 1, 27, 10, 43, 56, 1, 27, 0))
|
||||
s = time.strftime("%x%X", time_tuple)
|
||||
if s.isascii():
|
||||
# Fast path -- all digits are ASCII.
|
||||
self.LC_alt_digits = ()
|
||||
return
|
||||
|
||||
digits = ''.join(sorted(set(re.findall(r'\d', s))))
|
||||
if len(digits) == 10 and ord(digits[-1]) == ord(digits[0]) + 9:
|
||||
# All 10 decimal digits from the same set.
|
||||
if digits.isascii():
|
||||
# All digits are ASCII.
|
||||
self.LC_alt_digits = ()
|
||||
return
|
||||
|
||||
self.LC_alt_digits = [a + b for a in digits for b in digits]
|
||||
# Test whether the numbers contain leading zero.
|
||||
time_tuple2 = time.struct_time((2000, 1, 1, 1, 1, 1, 5, 1, 0))
|
||||
if self.LC_alt_digits[1] not in time.strftime("%x %X", time_tuple2):
|
||||
self.LC_alt_digits[:10] = digits
|
||||
return
|
||||
|
||||
# Either non-Gregorian calendar or non-decimal numbers.
|
||||
if {'\u4e00', '\u4e03', '\u4e5d', '\u5341', '\u5eff'}.issubset(s):
|
||||
# lzh_TW
|
||||
self.LC_alt_digits = lzh_TW_alt_digits
|
||||
return
|
||||
|
||||
self.LC_alt_digits = None
|
||||
|
||||
def __calc_date_time(self):
|
||||
# Set self.date_time, self.date, & self.time by using
|
||||
# time.strftime().
|
||||
# Set self.LC_date_time, self.LC_date, self.LC_time and
|
||||
# self.LC_time_ampm by using time.strftime().
|
||||
|
||||
# Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
|
||||
# overloaded numbers is minimized. The order in which searches for
|
||||
|
@ -129,26 +180,32 @@ class LocaleTime(object):
|
|||
# possible ambiguity for what something represents.
|
||||
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
|
||||
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
|
||||
replacement_pairs = [
|
||||
replacement_pairs = []
|
||||
|
||||
# Non-ASCII digits
|
||||
if self.LC_alt_digits or self.LC_alt_digits is None:
|
||||
for n, d in [(19, '%OC'), (99, '%Oy'), (22, '%OH'),
|
||||
(44, '%OM'), (55, '%OS'), (17, '%Od'),
|
||||
(3, '%Om'), (2, '%Ow'), (10, '%OI')]:
|
||||
if self.LC_alt_digits is None:
|
||||
s = chr(0x660 + n // 10) + chr(0x660 + n % 10)
|
||||
replacement_pairs.append((s, d))
|
||||
if n < 10:
|
||||
replacement_pairs.append((s[1], d))
|
||||
elif len(self.LC_alt_digits) > n:
|
||||
replacement_pairs.append((self.LC_alt_digits[n], d))
|
||||
else:
|
||||
replacement_pairs.append((time.strftime(d, time_tuple), d))
|
||||
replacement_pairs += [
|
||||
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
|
||||
('44', '%M'), ('55', '%S'), ('76', '%j'),
|
||||
('17', '%d'), ('03', '%m'), ('3', '%m'),
|
||||
# '3' needed for when no leading zero.
|
||||
('2', '%w'), ('10', '%I'),
|
||||
# Non-ASCII digits
|
||||
('\u0661\u0669\u0669\u0669', '%Y'),
|
||||
('\u0669\u0669', '%Oy'),
|
||||
('\u0662\u0662', '%OH'),
|
||||
('\u0664\u0664', '%OM'),
|
||||
('\u0665\u0665', '%OS'),
|
||||
('\u0661\u0667', '%Od'),
|
||||
('\u0660\u0663', '%Om'),
|
||||
('\u0663', '%Om'),
|
||||
('\u0662', '%Ow'),
|
||||
('\u0661\u0660', '%OI'),
|
||||
]
|
||||
|
||||
date_time = []
|
||||
for directive in ('%c', '%x', '%X'):
|
||||
for directive in ('%c', '%x', '%X', '%r'):
|
||||
current_format = time.strftime(directive, time_tuple).lower()
|
||||
current_format = current_format.replace('%', '%%')
|
||||
# The month and the day of the week formats are treated specially
|
||||
|
@ -172,9 +229,10 @@ class LocaleTime(object):
|
|||
if tz:
|
||||
current_format = current_format.replace(tz, "%Z")
|
||||
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
|
||||
current_format = re_sub(r'\d(?<![0-9])',
|
||||
lambda m: chr(0x0660 + int(m[0])),
|
||||
current_format)
|
||||
if not current_format.isascii() and self.LC_alt_digits is None:
|
||||
current_format = re_sub(r'\d(?<![0-9])',
|
||||
lambda m: chr(0x0660 + int(m[0])),
|
||||
current_format)
|
||||
for old, new in replacement_pairs:
|
||||
current_format = current_format.replace(old, new)
|
||||
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
|
||||
|
@ -189,6 +247,7 @@ class LocaleTime(object):
|
|||
self.LC_date_time = date_time[0]
|
||||
self.LC_date = date_time[1]
|
||||
self.LC_time = date_time[2]
|
||||
self.LC_time_ampm = date_time[3]
|
||||
|
||||
def __find_month_format(self, directive):
|
||||
"""Find the month format appropriate for the current locale.
|
||||
|
@ -213,7 +272,7 @@ class LocaleTime(object):
|
|||
full_indices &= indices
|
||||
indices = set(_findall(datetime, self.a_month[m]))
|
||||
if abbr_indices is None:
|
||||
abbr_indices = indices
|
||||
abbr_indices = set(indices)
|
||||
else:
|
||||
abbr_indices &= indices
|
||||
if not full_indices and not abbr_indices:
|
||||
|
@ -241,7 +300,7 @@ class LocaleTime(object):
|
|||
if self.f_weekday[wd] != self.a_weekday[wd]:
|
||||
indices = set(_findall(datetime, self.a_weekday[wd]))
|
||||
if abbr_indices is None:
|
||||
abbr_indices = indices
|
||||
abbr_indices = set(indices)
|
||||
else:
|
||||
abbr_indices &= indices
|
||||
if not full_indices and not abbr_indices:
|
||||
|
@ -288,8 +347,10 @@ class TimeRE(dict):
|
|||
# The " [1-9]" part of the regex is to make %c from ANSI C work
|
||||
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
|
||||
'f': r"(?P<f>[0-9]{1,6})",
|
||||
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
|
||||
'H': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
|
||||
'k': r"(?P<H>2[0-3]|[0-1]\d|\d| \d)",
|
||||
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
|
||||
'l': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
|
||||
'G': r"(?P<G>\d\d\d\d)",
|
||||
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
|
||||
'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
|
||||
|
@ -312,16 +373,49 @@ class TimeRE(dict):
|
|||
for tz in tz_names),
|
||||
'Z'),
|
||||
'%': '%'}
|
||||
for d in 'dmyHIMS':
|
||||
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
|
||||
mapping['Ow'] = r'(?P<w>\d)'
|
||||
if self.locale_time.LC_alt_digits is None:
|
||||
for d in 'dmyCHIMS':
|
||||
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
|
||||
mapping['Ow'] = r'(?P<w>\d)'
|
||||
else:
|
||||
mapping.update({
|
||||
'Od': self.__seqToRE(self.locale_time.LC_alt_digits[1:32], 'd',
|
||||
'3[0-1]|[1-2][0-9]|0[1-9]|[1-9]'),
|
||||
'Om': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'm',
|
||||
'1[0-2]|0[1-9]|[1-9]'),
|
||||
'Ow': self.__seqToRE(self.locale_time.LC_alt_digits[:7], 'w',
|
||||
'[0-6]'),
|
||||
'Oy': self.__seqToRE(self.locale_time.LC_alt_digits, 'y',
|
||||
'[0-9][0-9]'),
|
||||
'OC': self.__seqToRE(self.locale_time.LC_alt_digits, 'C',
|
||||
'[0-9][0-9]'),
|
||||
'OH': self.__seqToRE(self.locale_time.LC_alt_digits[:24], 'H',
|
||||
'2[0-3]|[0-1][0-9]|[0-9]'),
|
||||
'OI': self.__seqToRE(self.locale_time.LC_alt_digits[1:13], 'I',
|
||||
'1[0-2]|0[1-9]|[1-9]'),
|
||||
'OM': self.__seqToRE(self.locale_time.LC_alt_digits[:60], 'M',
|
||||
'[0-5][0-9]|[0-9]'),
|
||||
'OS': self.__seqToRE(self.locale_time.LC_alt_digits[:62], 'S',
|
||||
'6[0-1]|[0-5][0-9]|[0-9]'),
|
||||
})
|
||||
mapping.update({
|
||||
'e': mapping['d'],
|
||||
'Oe': mapping['Od'],
|
||||
'P': mapping['p'],
|
||||
'Op': mapping['p'],
|
||||
'W': mapping['U'].replace('U', 'W'),
|
||||
})
|
||||
mapping['W'] = mapping['U'].replace('U', 'W')
|
||||
|
||||
base.__init__(mapping)
|
||||
base.__setitem__('T', self.pattern('%H:%M:%S'))
|
||||
base.__setitem__('R', self.pattern('%H:%M'))
|
||||
base.__setitem__('r', self.pattern(self.locale_time.LC_time_ampm))
|
||||
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
|
||||
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
|
||||
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
|
||||
|
||||
def __seqToRE(self, to_convert, directive):
|
||||
def __seqToRE(self, to_convert, directive, altregex=None):
|
||||
"""Convert a list to a regex string for matching a directive.
|
||||
|
||||
Want possible matching values to be from longest to shortest. This
|
||||
|
@ -337,8 +431,9 @@ class TimeRE(dict):
|
|||
else:
|
||||
return ''
|
||||
regex = '|'.join(re_escape(stuff) for stuff in to_convert)
|
||||
regex = '(?P<%s>%s' % (directive, regex)
|
||||
return '%s)' % regex
|
||||
if altregex is not None:
|
||||
regex += '|' + altregex
|
||||
return '(?P<%s>%s)' % (directive, regex)
|
||||
|
||||
def pattern(self, format):
|
||||
"""Return regex pattern for the format string.
|
||||
|
@ -365,7 +460,7 @@ class TimeRE(dict):
|
|||
nonlocal day_of_month_in_format
|
||||
day_of_month_in_format = True
|
||||
return self[format_char]
|
||||
format = re_sub(r'%([OE]?\\?.?)', repl, format)
|
||||
format = re_sub(r'%[-_0^#]*[0-9]*([OE]?\\?.?)', repl, format)
|
||||
if day_of_month_in_format and not year_in_format:
|
||||
import warnings
|
||||
warnings.warn("""\
|
||||
|
@ -467,6 +562,15 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
|
|||
# values
|
||||
weekday = julian = None
|
||||
found_dict = found.groupdict()
|
||||
if locale_time.LC_alt_digits:
|
||||
def parse_int(s):
|
||||
try:
|
||||
return locale_time.LC_alt_digits.index(s)
|
||||
except ValueError:
|
||||
return int(s)
|
||||
else:
|
||||
parse_int = int
|
||||
|
||||
for group_key in found_dict.keys():
|
||||
# Directives not explicitly handled below:
|
||||
# c, x, X
|
||||
|
@ -474,30 +578,34 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
|
|||
# U, W
|
||||
# worthless without day of the week
|
||||
if group_key == 'y':
|
||||
year = int(found_dict['y'])
|
||||
# Open Group specification for strptime() states that a %y
|
||||
#value in the range of [00, 68] is in the century 2000, while
|
||||
#[69,99] is in the century 1900
|
||||
if year <= 68:
|
||||
year += 2000
|
||||
year = parse_int(found_dict['y'])
|
||||
if 'C' in found_dict:
|
||||
century = parse_int(found_dict['C'])
|
||||
year += century * 100
|
||||
else:
|
||||
year += 1900
|
||||
# Open Group specification for strptime() states that a %y
|
||||
#value in the range of [00, 68] is in the century 2000, while
|
||||
#[69,99] is in the century 1900
|
||||
if year <= 68:
|
||||
year += 2000
|
||||
else:
|
||||
year += 1900
|
||||
elif group_key == 'Y':
|
||||
year = int(found_dict['Y'])
|
||||
elif group_key == 'G':
|
||||
iso_year = int(found_dict['G'])
|
||||
elif group_key == 'm':
|
||||
month = int(found_dict['m'])
|
||||
month = parse_int(found_dict['m'])
|
||||
elif group_key == 'B':
|
||||
month = locale_time.f_month.index(found_dict['B'].lower())
|
||||
elif group_key == 'b':
|
||||
month = locale_time.a_month.index(found_dict['b'].lower())
|
||||
elif group_key == 'd':
|
||||
day = int(found_dict['d'])
|
||||
day = parse_int(found_dict['d'])
|
||||
elif group_key == 'H':
|
||||
hour = int(found_dict['H'])
|
||||
hour = parse_int(found_dict['H'])
|
||||
elif group_key == 'I':
|
||||
hour = int(found_dict['I'])
|
||||
hour = parse_int(found_dict['I'])
|
||||
ampm = found_dict.get('p', '').lower()
|
||||
# If there was no AM/PM indicator, we'll treat this like AM
|
||||
if ampm in ('', locale_time.am_pm[0]):
|
||||
|
@ -513,9 +621,9 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
|
|||
if hour != 12:
|
||||
hour += 12
|
||||
elif group_key == 'M':
|
||||
minute = int(found_dict['M'])
|
||||
minute = parse_int(found_dict['M'])
|
||||
elif group_key == 'S':
|
||||
second = int(found_dict['S'])
|
||||
second = parse_int(found_dict['S'])
|
||||
elif group_key == 'f':
|
||||
s = found_dict['f']
|
||||
# Pad to always return microseconds.
|
||||
|
|
|
@ -221,14 +221,16 @@ class StrptimeTests(unittest.TestCase):
|
|||
self.assertRaises(ValueError, _strptime._strptime_time, data_string="%d",
|
||||
format="%A")
|
||||
for bad_format in ("%", "% ", "%\n"):
|
||||
with self.assertRaisesRegex(ValueError, "stray % in format "):
|
||||
with (self.subTest(format=bad_format),
|
||||
self.assertRaisesRegex(ValueError, "stray % in format ")):
|
||||
_strptime._strptime_time("2005", bad_format)
|
||||
for bad_format in ("%e", "%Oe", "%O", "%O ", "%Ee", "%E", "%E ",
|
||||
"%.", "%+", "%_", "%~", "%\\",
|
||||
for bad_format in ("%i", "%Oi", "%O", "%O ", "%Ee", "%E", "%E ",
|
||||
"%.", "%+", "%~", "%\\",
|
||||
"%O.", "%O+", "%O_", "%O~", "%O\\"):
|
||||
directive = bad_format[1:].rstrip()
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
f"'{re.escape(directive)}' is a bad directive in format "):
|
||||
with (self.subTest(format=bad_format),
|
||||
self.assertRaisesRegex(ValueError,
|
||||
f"'{re.escape(directive)}' is a bad directive in format ")):
|
||||
_strptime._strptime_time("2005", bad_format)
|
||||
|
||||
msg_week_no_year_or_weekday = r"ISO week directive '%V' must be used with " \
|
||||
|
@ -480,13 +482,11 @@ class StrptimeTests(unittest.TestCase):
|
|||
# * Year is not included: ha_NG.
|
||||
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
|
||||
# On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
|
||||
#
|
||||
# BUG: Generates regexp that does not match the current date and time
|
||||
# for lzh_TW.
|
||||
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
|
||||
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
|
||||
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
|
||||
'my_MM', 'or_IN', 'shn_MM', 'az_IR')
|
||||
'my_MM', 'or_IN', 'shn_MM', 'az_IR',
|
||||
'byn_ER', 'wal_ET', 'lzh_TW')
|
||||
def test_date_time_locale(self):
|
||||
# Test %c directive
|
||||
loc = locale.getlocale(locale.LC_TIME)[0]
|
||||
|
@ -525,11 +525,9 @@ class StrptimeTests(unittest.TestCase):
|
|||
|
||||
# NB: Does not roundtrip on locales that use a non-Gregorian calendar:
|
||||
# lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
|
||||
# BUG: Generates regexp that does not match the current date
|
||||
# for lzh_TW.
|
||||
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
|
||||
'he_IL', 'eu_ES', 'ar_AE',
|
||||
'az_IR', 'my_MM', 'or_IN', 'shn_MM')
|
||||
'az_IR', 'my_MM', 'or_IN', 'shn_MM', 'lzh_TW')
|
||||
def test_date_locale(self):
|
||||
# Test %x directive
|
||||
now = time.time()
|
||||
|
@ -546,7 +544,7 @@ class StrptimeTests(unittest.TestCase):
|
|||
# NB: Dates before 1969 do not roundtrip on many locales, including C.
|
||||
@unittest.skipIf(support.linked_to_musl(), "musl libc issue, bpo-46390")
|
||||
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
|
||||
'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
|
||||
'eu_ES', 'ar_AE', 'my_MM', 'shn_MM', 'lzh_TW')
|
||||
def test_date_locale2(self):
|
||||
# Test %x directive
|
||||
loc = locale.getlocale(locale.LC_TIME)[0]
|
||||
|
@ -562,11 +560,11 @@ class StrptimeTests(unittest.TestCase):
|
|||
# norwegian, nynorsk.
|
||||
# * Hours are in 12-hour notation without AM/PM indication: hy_AM,
|
||||
# ms_MY, sm_WS.
|
||||
# BUG: Generates regexp that does not match the current time for lzh_TW.
|
||||
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
|
||||
'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
|
||||
'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
|
||||
'ti_ET', 'tig_ER', 'wal_ET')
|
||||
'ti_ET', 'tig_ER', 'wal_ET', 'lzh_TW',
|
||||
'ar_SA', 'bg_BG')
|
||||
def test_time_locale(self):
|
||||
# Test %X directive
|
||||
loc = locale.getlocale(locale.LC_TIME)[0]
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Fix :func:`time.strptime` for ``%c`` and ``%x`` formats on locales byn_ER,
|
||||
wal_ET and lzh_TW, and for ``%X`` format on locales ar_SA, bg_BG and lzh_TW.
|
Loading…
Add table
Add a link
Reference in a new issue