bpo-33189: pygettext.py now accepts only literal strings (GH-6364)

pygettext.py now accepts only literal strings as docstrings and
translatable strings, and rejects bytes literals and f-string expressions.

(cherry picked from commit 69524821a8)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>

parent 198c0c0509
commit a4fb580f70

3 changed files with 76 additions and 11 deletions
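
The heart of the change is a small helper added to pygettext.py: a tokenize.STRING
token is treated as a plain literal only if it begins with a quote, or with an
r/R/u/U prefix followed by a quote, so b'...' and f'...' tokens are skipped wherever
docstrings and _() arguments are collected. The standalone sketch below reproduces
the helper exactly as added by this commit; the driver loop is illustrative only
(not part of the commit) and shows how the check classifies the token texts
exercised by the new tests.

    def is_literal_string(s):
        # True for '...', "...", and r/R/u/U-prefixed literals;
        # False for b'...', f'...', and any other prefix.
        return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')

    # Token strings exactly as tokenize.STRING would hand them to pygettext,
    # prefixes and quotes included.
    for tok in ('"""doc"""', "r'''doc'''", 'u"doc"', 'b"""doc"""', 'f"""doc"""'):
        print(tok, '->', is_literal_string(tok))
    # The first three report True; the bytes and f-string tokens report False,
    # so pygettext no longer extracts them.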
Changed file 1 of 3 (pygettext test suite):

@@ -3,7 +3,7 @@
 import os
 import sys
 import unittest
-import textwrap
+from textwrap import dedent
 
 from test.support.script_helper import assert_python_ok
 from test.test_tools import skip_if_missing, toolsdir
@@ -109,9 +109,68 @@ class Test_pygettext(unittest.TestCase):
         # This will raise if the date format does not exactly match.
         datetime.strptime(creationDate, '%Y-%m-%d %H:%M%z')
 
+    def test_funcdocstring(self):
+        for doc in ('"""doc"""', "r'''doc'''", "R'doc'", 'u"doc"'):
+            with self.subTest(doc):
+                msgids = self.extract_docstrings_from_str(dedent('''\
+                def foo(bar):
+                    %s
+                ''' % doc))
+                self.assertIn('doc', msgids)
+
+    def test_funcdocstring_bytes(self):
+        msgids = self.extract_docstrings_from_str(dedent('''\
+        def foo(bar):
+            b"""doc"""
+        '''))
+        self.assertFalse([msgid for msgid in msgids if 'doc' in msgid])
+
+    def test_funcdocstring_fstring(self):
+        msgids = self.extract_docstrings_from_str(dedent('''\
+        def foo(bar):
+            f"""doc"""
+        '''))
+        self.assertFalse([msgid for msgid in msgids if 'doc' in msgid])
+
+    def test_classdocstring(self):
+        for doc in ('"""doc"""', "r'''doc'''", "R'doc'", 'u"doc"'):
+            with self.subTest(doc):
+                msgids = self.extract_docstrings_from_str(dedent('''\
+                class C:
+                    %s
+                ''' % doc))
+                self.assertIn('doc', msgids)
+
+    def test_classdocstring_bytes(self):
+        msgids = self.extract_docstrings_from_str(dedent('''\
+        class C:
+            b"""doc"""
+        '''))
+        self.assertFalse([msgid for msgid in msgids if 'doc' in msgid])
+
+    def test_classdocstring_fstring(self):
+        msgids = self.extract_docstrings_from_str(dedent('''\
+        class C:
+            f"""doc"""
+        '''))
+        self.assertFalse([msgid for msgid in msgids if 'doc' in msgid])
+
+    def test_msgid(self):
+        msgids = self.extract_docstrings_from_str(
+            '''_("""doc""" r'str' u"ing")''')
+        self.assertIn('docstring', msgids)
+
+    def test_msgid_bytes(self):
+        msgids = self.extract_docstrings_from_str('_(b"""doc""")')
+        self.assertFalse([msgid for msgid in msgids if 'doc' in msgid])
+
+    def test_msgid_fstring(self):
+        msgids = self.extract_docstrings_from_str('_(f"""doc""")')
+        self.assertFalse([msgid for msgid in msgids if 'doc' in msgid])
+
     def test_funcdocstring_annotated_args(self):
         """ Test docstrings for functions with annotated args """
-        msgids = self.extract_docstrings_from_str(textwrap.dedent('''\
+        msgids = self.extract_docstrings_from_str(dedent('''\
         def foo(bar: str):
             """doc"""
         '''))
@@ -119,7 +178,7 @@ class Test_pygettext(unittest.TestCase):
 
     def test_funcdocstring_annotated_return(self):
         """ Test docstrings for functions with annotated return type """
-        msgids = self.extract_docstrings_from_str(textwrap.dedent('''\
+        msgids = self.extract_docstrings_from_str(dedent('''\
         def foo(bar) -> str:
             """doc"""
         '''))
@@ -127,7 +186,7 @@ class Test_pygettext(unittest.TestCase):
 
     def test_funcdocstring_defvalue_args(self):
         """ Test docstring for functions with default arg values """
-        msgids = self.extract_docstrings_from_str(textwrap.dedent('''\
+        msgids = self.extract_docstrings_from_str(dedent('''\
         def foo(bar=()):
             """doc"""
         '''))
@@ -137,7 +196,7 @@ class Test_pygettext(unittest.TestCase):
         """ Test docstring extraction for multiple functions combining
         annotated args, annotated return types and default arg values
         """
-        msgids = self.extract_docstrings_from_str(textwrap.dedent('''\
+        msgids = self.extract_docstrings_from_str(dedent('''\
         def foo1(bar: tuple=()) -> str:
             """doc1"""
 
@@ -155,7 +214,7 @@ class Test_pygettext(unittest.TestCase):
         """ Test docstring extraction for a class with colons occuring within
         the parentheses.
         """
-        msgids = self.extract_docstrings_from_str(textwrap.dedent('''\
+        msgids = self.extract_docstrings_from_str(dedent('''\
         class D(L[1:2], F({1: 2}), metaclass=M(lambda x: x)):
             """doc"""
         '''))
Changed file 2 of 3 (NEWS entry, new file):

@@ -0,0 +1,2 @@
+:program:`pygettext.py` now recognizes only literal strings as docstrings
+and translatable strings, and rejects bytes literals and f-string expressions.
Changed file 3 of 3 (pygettext.py):

@@ -232,6 +232,10 @@ def escape_nonascii(s, encoding):
     return ''.join(escapes[b] for b in s.encode(encoding))
 
 
+def is_literal_string(s):
+    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')
+
+
 def safe_eval(s):
     # unwrap quotes, safely
     return eval(s, {'__builtins__':{}}, {})
@@ -317,8 +321,8 @@ class TokenEater:
     def __call__(self, ttype, tstring, stup, etup, line):
         # dispatch
 ##        import token
-##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
-##              'tstring:', tstring
+##        print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
+##              file=sys.stderr)
         self.__state(ttype, tstring, stup[0])
 
     def __waiting(self, ttype, tstring, lineno):
@@ -327,7 +331,7 @@ class TokenEater:
         if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
             # module docstring?
             if self.__freshmodule:
-                if ttype == tokenize.STRING:
+                if ttype == tokenize.STRING and is_literal_string(tstring):
                     self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                     self.__freshmodule = 0
                 elif ttype not in (tokenize.COMMENT, tokenize.NL):
@@ -353,7 +357,7 @@ class TokenEater:
 
     def __suitedocstring(self, ttype, tstring, lineno):
         # ignore any intervening noise
-        if ttype == tokenize.STRING:
+        if ttype == tokenize.STRING and is_literal_string(tstring):
             self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
             self.__state = self.__waiting
         elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
@@ -378,7 +382,7 @@ class TokenEater:
             if self.__data:
                 self.__addentry(EMPTYSTRING.join(self.__data))
             self.__state = self.__waiting
-        elif ttype == tokenize.STRING:
+        elif ttype == tokenize.STRING and is_literal_string(tstring):
             self.__data.append(safe_eval(tstring))
         elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                            token.NEWLINE, tokenize.NL]:
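
To see the new behavior end to end, one can run pygettext with docstring extraction
enabled over a small module and inspect the generated catalog. This is a usage
sketch, not part of the commit: sample.py and its contents are hypothetical, and it
assumes a CPython checkout where Tools/i18n/pygettext.py accepts the -D/--docstrings
option and writes messages.pot by default.

    # sample.py (hypothetical input)
    def greet(name):
        f"""f-string docstring -- no longer extracted"""
        return _("hello")          # literal _() argument -- extracted

    class Greeter:
        """Plain literal docstring -- extracted when -D is given."""

    # Run the extractor and inspect the result:
    #   python Tools/i18n/pygettext.py -D sample.py
    #   grep msgid messages.pot
    # Aside from the catalog header entry, only "hello" and the Greeter
    # docstring should appear as msgids.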