[3.13] gh-53203: Fix strptime() for %c, %x and %X formats on many locales (GH-125406) (GH-125454)

Fixed strptime() parsing of %c, %x and %X for most locales that use
non-ASCII digits, such as Persian, Burmese, Odia and Shan.
(cherry picked from commit 5f4e5b598c)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2024-10-14 19:45:26 +02:00 committed by GitHub
parent 7966c7d69e
commit cbcdf34a4b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 77 additions and 43 deletions

View file

@ -15,6 +15,7 @@ import time
import locale import locale
import calendar import calendar
from re import compile as re_compile from re import compile as re_compile
from re import sub as re_sub
from re import IGNORECASE from re import IGNORECASE
from re import escape as re_escape from re import escape as re_escape
from datetime import (date as datetime_date, from datetime import (date as datetime_date,
@ -133,7 +134,19 @@ class LocaleTime(object):
('44', '%M'), ('55', '%S'), ('76', '%j'), ('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'), ('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero. # '3' needed for when no leading zero.
('2', '%w'), ('10', '%I')] ('2', '%w'), ('10', '%I'),
# Non-ASCII digits
('\u0661\u0669\u0669\u0669', '%Y'),
('\u0669\u0669', '%Oy'),
('\u0662\u0662', '%OH'),
('\u0664\u0664', '%OM'),
('\u0665\u0665', '%OS'),
('\u0661\u0667', '%Od'),
('\u0660\u0663', '%Om'),
('\u0663', '%Om'),
('\u0662', '%Ow'),
('\u0661\u0660', '%OI'),
]
date_time = [] date_time = []
for directive in ('%c', '%x', '%X'): for directive in ('%c', '%x', '%X'):
current_format = time.strftime(directive, time_tuple).lower() current_format = time.strftime(directive, time_tuple).lower()
@ -158,6 +171,10 @@ class LocaleTime(object):
for tz in tz_values: for tz in tz_values:
if tz: if tz:
current_format = current_format.replace(tz, "%Z") current_format = current_format.replace(tz, "%Z")
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
current_format = re_sub(r'\d(?<![0-9])',
lambda m: chr(0x0660 + int(m[0])),
current_format)
for old, new in replacement_pairs: for old, new in replacement_pairs:
current_format = current_format.replace(old, new) current_format = current_format.replace(old, new)
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
@ -267,7 +284,7 @@ class TimeRE(dict):
else: else:
self.locale_time = LocaleTime() self.locale_time = LocaleTime()
base = super() base = super()
base.__init__({ mapping = {
# The " [1-9]" part of the regex is to make %c from ANSI C work # The " [1-9]" part of the regex is to make %c from ANSI C work
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", 'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})", 'f': r"(?P<f>[0-9]{1,6})",
@ -296,11 +313,15 @@ class TimeRE(dict):
'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone 'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
for tz in tz_names), for tz in tz_names),
'Z'), 'Z'),
'%': '%'}) '%': '%'}
base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) for d in 'dmyHIMS':
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
base.__setitem__('x', self.pattern(self.locale_time.LC_date)) mapping['Ow'] = r'(?P<w>\d)'
mapping['W'] = mapping['U'].replace('U', 'W')
base.__init__(mapping)
base.__setitem__('X', self.pattern(self.locale_time.LC_time)) base.__setitem__('X', self.pattern(self.locale_time.LC_time))
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
def __seqToRE(self, to_convert, directive): def __seqToRE(self, to_convert, directive):
"""Convert a list to a regex string for matching a directive. """Convert a list to a regex string for matching a directive.
@ -328,28 +349,25 @@ class TimeRE(dict):
regex syntax are escaped. regex syntax are escaped.
""" """
processed_format = ''
# The sub() call escapes all characters that might be misconstrued # The sub() call escapes all characters that might be misconstrued
# as regex syntax. Cannot use re.escape since we have to deal with # as regex syntax. Cannot use re.escape since we have to deal with
# format directives (%m, etc.). # format directives (%m, etc.).
regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])") format = re_sub(r"([\\.^$*+?\(\){}\[\]|])", r"\\\1", format)
format = regex_chars.sub(r"\\\1", format) format = re_sub(r'\s+', r'\\s+', format)
whitespace_replacement = re_compile(r'\s+') format = re_sub(r"'", "['\u02bc]", format) # needed for br_FR
format = whitespace_replacement.sub(r'\\s+', format)
year_in_format = False year_in_format = False
day_of_month_in_format = False day_of_month_in_format = False
while '%' in format: def repl(m):
directive_index = format.index('%')+1 format_char = m[1]
format_char = format[directive_index]
processed_format = "%s%s%s" % (processed_format,
format[:directive_index-1],
self[format_char])
format = format[directive_index+1:]
match format_char: match format_char:
case 'Y' | 'y' | 'G': case 'Y' | 'y' | 'G':
nonlocal year_in_format
year_in_format = True year_in_format = True
case 'd': case 'd':
nonlocal day_of_month_in_format
day_of_month_in_format = True day_of_month_in_format = True
return self[format_char]
format = re_sub(r'%(O?.)', repl, format)
if day_of_month_in_format and not year_in_format: if day_of_month_in_format and not year_in_format:
import warnings import warnings
warnings.warn("""\ warnings.warn("""\
@ -360,7 +378,7 @@ To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.""", See https://github.com/python/cpython/issues/70647.""",
DeprecationWarning, DeprecationWarning,
skip_file_prefixes=(os.path.dirname(__file__),)) skip_file_prefixes=(os.path.dirname(__file__),))
return "%s%s" % (processed_format, format) return format
def compile(self, format): def compile(self, format):
"""Return a compiled re object for the format string.""" """Return a compiled re object for the format string."""
@ -434,8 +452,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
_regex_cache[format] = format_regex _regex_cache[format] = format_regex
found = format_regex.match(data_string) found = format_regex.match(data_string)
if not found: if not found:
raise ValueError("time data %r does not match format %r :: /%s/" % raise ValueError("time data %r does not match format %r" %
(data_string, format, format_regex.pattern)) (data_string, format))
if len(data_string) != found.end(): if len(data_string) != found.end():
raise ValueError("unconverted data remains: %s" % raise ValueError("unconverted data remains: %s" %
data_string[found.end():]) data_string[found.end():])

View file

@ -292,7 +292,7 @@ class StrptimeTests(unittest.TestCase):
# additional check for IndexError branch (issue #19545) # additional check for IndexError branch (issue #19545)
with self.assertRaises(ValueError) as e: with self.assertRaises(ValueError) as e:
_strptime._strptime_time('19', '%Y %') _strptime._strptime_time('19', '%Y %')
self.assertIs(e.exception.__suppress_context__, True) self.assertIsNone(e.exception.__context__)
def test_unconverteddata(self): def test_unconverteddata(self):
# Check ValueError is raised when there is unconverted data # Check ValueError is raised when there is unconverted data
@ -485,12 +485,14 @@ class StrptimeTests(unittest.TestCase):
# id_ID, ms_MY. # id_ID, ms_MY.
# * Year is not included: ha_NG. # * Year is not included: ha_NG.
# * Use non-Gregorian calendar: lo_LA, thai, th_TH. # * Use non-Gregorian calendar: lo_LA, thai, th_TH.
# On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
# #
# BUG: Generates regexp that does not match the current date and time # BUG: Generates regexp that does not match the current date and time
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM. # for lzh_TW.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG', 'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN') 'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
'my_MM', 'or_IN', 'shn_MM', 'az_IR')
def test_date_time_locale(self): def test_date_time_locale(self):
# Test %c directive # Test %c directive
loc = locale.getlocale(locale.LC_TIME)[0] loc = locale.getlocale(locale.LC_TIME)[0]
@ -512,20 +514,23 @@ class StrptimeTests(unittest.TestCase):
self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600)) self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600))
# NB: Dates before 1969 do not roundtrip on some locales: # NB: Dates before 1969 do not roundtrip on some locales:
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR. # az_IR, bo_CN, bo_IN, dz_BT, eu_ES, eu_FR, fa_IR, or_IN.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG', 'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN') 'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
'my_MM', 'shn_MM')
def test_date_time_locale2(self): def test_date_time_locale2(self):
# Test %c directive # Test %c directive
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0)) self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
self.roundtrip('%c', slice(0, 6), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
# NB: Does not roundtrip because use non-Gregorian calendar: # NB: Does not roundtrip because use non-Gregorian calendar:
# lo_LA, thai, th_TH. # lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
# BUG: Generates regexp that does not match the current date # BUG: Generates regexp that does not match the current date
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM. # for lzh_TW.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE') 'he_IL', 'eu_ES', 'ar_AE',
'az_IR', 'my_MM', 'or_IN', 'shn_MM')
def test_date_locale(self): def test_date_locale(self):
# Test %x directive # Test %x directive
now = time.time() now = time.time()
@ -545,10 +550,11 @@ class StrptimeTests(unittest.TestCase):
"musl libc issue on Emscripten, bpo-46390" "musl libc issue on Emscripten, bpo-46390"
) )
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP', @run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'eu_ES', 'ar_AE') 'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
def test_date_locale2(self): def test_date_locale2(self):
# Test %x directive # Test %x directive
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0)) self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
self.roundtrip('%x', slice(0, 3), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
# NB: Does not roundtrip in some locales due to the ambiguity of # NB: Does not roundtrip in some locales due to the ambiguity of
# the time representation (bugs in locales?): # the time representation (bugs in locales?):
@ -556,19 +562,27 @@ class StrptimeTests(unittest.TestCase):
# norwegian, nynorsk. # norwegian, nynorsk.
# * Hours are in 12-hour notation without AM/PM indication: hy_AM, # * Hours are in 12-hour notation without AM/PM indication: hy_AM,
# ms_MY, sm_WS. # ms_MY, sm_WS.
# BUG: Generates regexp that does not match the current time for # BUG: Generates regexp that does not match the current time for lzh_TW.
# aa_DJ, aa_ER, aa_ET, am_ET, az_IR, byn_ER, fa_IR, gez_ER, gez_ET, @run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
# lzh_TW, my_MM, om_ET, om_KE, or_IN, shn_MM, sid_ET, so_DJ, so_ET, 'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
# so_SO, ti_ER, ti_ET, tig_ER, wal_ET. 'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP') 'ti_ET', 'tig_ER', 'wal_ET')
def test_time_locale(self): def test_time_locale(self):
# Test %X directive # Test %X directive
loc = locale.getlocale(locale.LC_TIME)[0]
pos = slice(3, 6)
if glibc_ver and glibc_ver < (2, 29) and loc in {
'aa_ET', 'am_ET', 'byn_ER', 'gez_ET', 'om_ET',
'sid_ET', 'so_SO', 'ti_ET', 'tig_ER', 'wal_ET'}:
# Hours are in 12-hour notation without AM/PM indication.
# Ignore hours.
pos = slice(4, 6)
now = time.time() now = time.time()
self.roundtrip('%X', slice(3, 6), time.localtime(now)) self.roundtrip('%X', pos, time.localtime(now))
# 1 hour 20 minutes 30 seconds ago # 1 hour 20 minutes 30 seconds ago
self.roundtrip('%X', slice(3, 6), time.localtime(now - 4830)) self.roundtrip('%X', pos, time.localtime(now - 4830))
# 12 hours ago # 12 hours ago
self.roundtrip('%X', slice(3, 6), time.localtime(now - 12*3600)) self.roundtrip('%X', pos, time.localtime(now - 12*3600))
def test_percent(self): def test_percent(self):
# Make sure % signs are handled properly # Make sure % signs are handled properly

View file

@ -298,7 +298,7 @@ class TimeTestCase(unittest.TestCase):
# additional check for IndexError branch (issue #19545) # additional check for IndexError branch (issue #19545)
with self.assertRaises(ValueError) as e: with self.assertRaises(ValueError) as e:
time.strptime('19', '%Y %') time.strptime('19', '%Y %')
self.assertIs(e.exception.__suppress_context__, True) self.assertIsNone(e.exception.__context__)
def test_strptime_leap_year(self): def test_strptime_leap_year(self):
# GH-70647: warns if parsing a format with a day and no year. # GH-70647: warns if parsing a format with a day and no year.

View file

@ -0,0 +1,2 @@
Fix :func:`time.strptime` for ``%c``, ``%x`` and ``%X`` formats in many
locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan.