gh-130057: Pygettext: Support translator comments (GH-130061)

2025-10-14 18:59:46 +00:00 · 2025-02-17 11:41:28 +01:00 · 2025-02-17 11:41:28 +01:00 · aa845af9bb
commit aa845af9bb
parent 6669905723
5 changed files with 335 additions and 20 deletions
--- a/Lib/test/test_tools/i18n_data/comments.pot
+++ b/Lib/test/test_tools/i18n_data/comments.pot
@ -0,0 +1,110 @@
 # SOME DESCRIPTIVE TITLE.
 # Copyright (C) YEAR ORGANIZATION
 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
 #
 msgid ""
 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\n"
 "POT-Creation-Date: 2000-01-01 00:00+0000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Generated-By: pygettext.py 1.5\n"
 #: comments.py:4
 msgid "foo"
 msgstr ""
 #. i18n: This is a translator comment
 #: comments.py:7
 msgid "bar"
 msgstr ""
 #. i18n: This is a translator comment
 #. i18n: This is another translator comment
 #: comments.py:11
 msgid "baz"
 msgstr ""
 #. i18n: This is a translator comment
 #. with multiple
 #. lines
 #: comments.py:16
 msgid "qux"
 msgstr ""
 #. i18n: This is a translator comment
 #: comments.py:21
 msgid "quux"
 msgstr ""
 #. i18n: This is a translator comment
 #. with multiple lines
 #. i18n: This is another translator comment
 #. with multiple lines
 #: comments.py:27
 msgid "corge"
 msgstr ""
 #: comments.py:31
 msgid "grault"
 msgstr ""
 #. i18n: This is another translator comment
 #: comments.py:36
 msgid "garply"
 msgstr ""
 #: comments.py:40
 msgid "george"
 msgstr ""
 #. i18n: This is another translator comment
 #: comments.py:45
 msgid "waldo"
 msgstr ""
 #. i18n: This is a translator comment
 #. i18n: This is also a translator comment
 #. i18n: This is another translator comment
 #: comments.py:50
 msgid "waldo2"
 msgstr ""
 #. i18n: This is a translator comment
 #. i18n: This is another translator comment
 #. i18n: This is yet another translator comment
 #. i18n: This is a translator comment
 #. with multiple lines
 #: comments.py:53 comments.py:56 comments.py:59 comments.py:63
 msgid "fred"
 msgstr ""
 #: comments.py:65
 msgid "plugh"
 msgstr ""
 #: comments.py:67
 msgid "foobar"
 msgstr ""
 #. i18n: This is a translator comment
 #: comments.py:71
 msgid "xyzzy"
 msgstr ""
 #: comments.py:72
 msgid "thud"
 msgstr ""
 #. i18n: This is a translator comment
 #. i18n: This is another translator comment
 #. i18n: This is yet another translator comment
 #: comments.py:78
 msgid "foos"
 msgstr ""
--- a/Lib/test/test_tools/i18n_data/comments.py
+++ b/Lib/test/test_tools/i18n_data/comments.py
@ -0,0 +1,78 @@
 from gettext import gettext as _
 # Not a translator comment
 _('foo')
 # i18n: This is a translator comment
 _('bar')
 # i18n: This is a translator comment
 # i18n: This is another translator comment
 _('baz')
 # i18n: This is a translator comment
 # with multiple
 # lines
 _('qux')
 # This comment should not be included because
 # it does not start with the prefix
 # i18n: This is a translator comment
 _('quux')
 # i18n: This is a translator comment
 # with multiple lines
 # i18n: This is another translator comment
 # with multiple lines
 _('corge')
 # i18n: This comment should be ignored
 _('grault')
 # i18n: This comment should be ignored
 # i18n: This is another translator comment
 _('garply')
 # i18n: comment should be ignored
 x = 1
 _('george')
 # i18n: This comment should be ignored
 x = 1
 # i18n: This is another translator comment
 _('waldo')
 # i18n: This is a translator comment
 x = 1  # i18n: This is also a translator comment
 # i18n: This is another translator comment
 _('waldo2')
 # i18n: This is a translator comment
 _('fred')
 # i18n: This is another translator comment
 _('fred')
 # i18n: This is yet another translator comment
 _('fred')
 # i18n: This is a translator comment
 # with multiple lines
 _('fred')
 _('plugh')  # i18n: This comment should be ignored
 _('foo'  # i18n: This comment should be ignored
  'bar')  # i18n: This comment should be ignored
 # i18n: This is a translator comment
 _('xyzzy')
 _('thud')
 ## i18n: This is a translator comment
 # # i18n: This is another translator comment
 ### ###    i18n: This is yet another translator comment
 _('foos')
--- a/Lib/test/test_tools/test_i18n.py
+++ b/Lib/test/test_tools/test_i18n.py
@ -87,7 +87,8 @@ class Test_pygettext(unittest.TestCase):
        self.maxDiff = None
        self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual))
-    def extract_from_str(self, module_content, *, args=(), strict=True, with_stderr=False):
+    def extract_from_str(self, module_content, *, args=(), strict=True,
                         with_stderr=False, raw=False):
        """Return all msgids extracted from module_content."""
        filename = 'test.py'
        with temp_cwd(None):
@ -98,10 +99,11 @@ class Test_pygettext(unittest.TestCase):
                self.assertEqual(res.err, b'')
            with open('messages.pot', encoding='utf-8') as fp:
                data = fp.read()
-        msgids = self.get_msgids(data)
+        if not raw:
            data = self.get_msgids(data)
        if not with_stderr:
-            return msgids
+            return data
-        return msgids, res.err
+        return data, res.err
    def extract_docstrings_from_str(self, module_content):
        """Return all docstrings extracted from module_content."""
@ -381,7 +383,8 @@ class Test_pygettext(unittest.TestCase):
                contents = input_file.read_text(encoding='utf-8')
                with temp_cwd(None):
                    Path(input_file.name).write_text(contents)
-                    assert_python_ok('-Xutf8', self.script, '--docstrings', input_file.name)
+                    assert_python_ok('-Xutf8', self.script, '--docstrings',
                                     '--add-comments=i18n:', input_file.name)
                    output = Path('messages.pot').read_text(encoding='utf-8')
                expected = output_file.read_text(encoding='utf-8')
@ -437,6 +440,51 @@ class Test_pygettext(unittest.TestCase):
            "*** test.py:3: Variable positional arguments are not allowed in gettext calls\n"
        )
    def test_extract_all_comments(self):
        """
        Check that --add-comments (or -c) given without a tag
        causes every comment above a gettext call to be extracted.
        """
        source = dedent('''\
                # Translator comment
                _("foo")
                ''')
        for flag in ('--add-comments', '-c'):
            with self.subTest(arg=flag):
                # raw=True returns the POT text itself rather than msgids.
                pot = self.extract_from_str(source, args=(flag,), raw=True)
                self.assertIn('#. Translator comment', pot)
    def test_comments_with_multiple_tags(self):
        """
        Check that several --add-comments tags may be passed and that
        only comments starting with one of those tags are extracted.
        """
        source = dedent('''\
                # foo: comment
                _("foo")
                # bar: comment
                _("bar")
                # baz: comment
                _("baz")
                ''')
        for template in ('--add-comments={}', '-c{}'):
            with self.subTest(arg=template):
                # One option per tag, in both long and short spelling.
                tag_args = (template.format('foo:'), template.format('bar:'))
                pot = self.extract_from_str(source, args=tag_args, raw=True)
                self.assertIn('#. foo: comment', pot)
                self.assertIn('#. bar: comment', pot)
                # 'baz:' was never requested, so its comment is dropped.
                self.assertNotIn('#. baz: comment', pot)
    def test_comments_not_extracted_without_tags(self):
        """
        Check that no translator comments are written to the POT
        output when --add-comments is not passed.
        """
        source = dedent('''\
        # Translator comment
        _("foo")
        ''')
        pot = self.extract_from_str(source, raw=True)
        # '#.' is the marker that prefixes extracted comments.
        self.assertNotIn('#.', pot)
 def update_POT_snapshots():
    for input_file in DATA_DIR.glob('*.py'):
@ -444,7 +492,8 @@ def update_POT_snapshots():
        contents = input_file.read_bytes()
        with temp_cwd(None):
            Path(input_file.name).write_bytes(contents)
-            assert_python_ok('-Xutf8', Test_pygettext.script, '--docstrings', input_file.name)
+            assert_python_ok('-Xutf8', Test_pygettext.script, '--docstrings',
                             '--add-comments=i18n:', input_file.name)
            output = Path('messages.pot').read_text(encoding='utf-8')
        output = normalize_POT_file(output)
--- a/Misc/NEWS.d/next/Tools-Demos/2025-02-12-23-24-37.gh-issue-130057.TKUKI6.rst
+++ b/Misc/NEWS.d/next/Tools-Demos/2025-02-12-23-24-37.gh-issue-130057.TKUKI6.rst
@ -0,0 +1 @@
 Add support for translator comments in :program:`pygettext.py`.
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@ -46,6 +46,12 @@ Options:
    --extract-all
        Extract all strings.
    -cTAG
    --add-comments=TAG
        Extract translator comments.  Comments must start with TAG and
        must precede the gettext call.  Multiple -cTAG options are allowed.
        In that case, any comment matching any of the TAGs will be extracted.
    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.
@ -141,7 +147,9 @@ import importlib.util
 import os
 import sys
 import time
 import tokenize
 from dataclasses import dataclass, field
 from io import BytesIO
 from operator import itemgetter
 __version__ = '1.5'
@ -302,12 +310,30 @@ class Message:
    msgctxt: str | None
    locations: set[Location] = field(default_factory=set)
    is_docstring: bool = False
    comments: list[str] = field(default_factory=list)
-    def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=False):
+    def add_location(self, filename, lineno, msgid_plural=None, *,
                     is_docstring=False, comments=None):
        if self.msgid_plural is None:
            self.msgid_plural = msgid_plural
        self.locations.add(Location(filename, lineno))
        self.is_docstring |= is_docstring
        if comments:
            self.comments.extend(comments)
 def get_source_comments(source):
    """
    Map the line number of every comment in *source* to its text.

    *source* is the module contents as bytes.  Any leading run of
    '#', space and tab characters is removed from each comment.
    """
    readline = BytesIO(source).readline
    return {
        tok.start[0]: tok.string.lstrip('# \t')
        for tok in tokenize.tokenize(readline)
        if tok.type == tokenize.COMMENT
    }
 class GettextVisitor(ast.NodeVisitor):
@ -316,10 +342,18 @@ class GettextVisitor(ast.NodeVisitor):
        self.options = options
        self.filename = None
        self.messages = {}
        self.comments = {}
    def visit_file(self, source, filename):
        try:
            module_tree = ast.parse(source)
        except SyntaxError:
            return
    def visit_file(self, node, filename):
        self.filename = filename
-        self.visit(node)
+        if self.options.comment_tags:
            self.comments = get_source_comments(source)
        self.visit(module_tree)
    def visit_Module(self, node):
        self._extract_docstring(node)
@ -372,14 +406,51 @@ class GettextVisitor(ast.NodeVisitor):
            msg_data[arg_type] = arg.value
        lineno = node.lineno
-        self._add_message(lineno, **msg_data)
+        comments = self._extract_comments(node)
        self._add_message(lineno, **msg_data, comments=comments)
    def _extract_comments(self, node):
        """Return the translator comments attached to a gettext call.

        The consecutive lines directly above the call that have an
        entry in self.comments are considered.  Everything from the
        first comment that starts with one of the --add-comments tags
        onwards is returned.  An empty list is returned when no tags
        were requested or no comment carries a tag.  See the tests
        for examples.
        """
        if not self.options.comment_tags:
            return []

        # Walk upwards from the line above the call and gather the
        # unbroken run of comment lines (nearest line first).
        preceding = []
        for lineno in range(node.lineno - 1, 0, -1):
            text = self.comments.get(lineno)
            if text is None:
                break
            preceding.append(text)

        # Restore source order, then drop everything that comes
        # before the first comment carrying a tag.
        preceding.reverse()
        for index, text in enumerate(preceding):
            if self._is_translator_comment(text):
                return preceding[index:]
        return []
    def _is_translator_comment(self, comment):
        """Return True if *comment* starts with a --add-comments tag."""
        tags = self.options.comment_tags
        return comment.startswith(tags)
    def _add_message(
            self, lineno, msgid, msgid_plural=None, msgctxt=None, *,
-            is_docstring=False):
+            is_docstring=False, comments=None):
        if msgid in self.options.toexclude:
            return
        if not comments:
            comments = []
        key = self._key_for(msgid, msgctxt)
        message = self.messages.get(key)
        if message:
@ -388,6 +459,7 @@ class GettextVisitor(ast.NodeVisitor):
                lineno,
                msgid_plural,
                is_docstring=is_docstring,
                comments=comments,
            )
        else:
            self.messages[key] = Message(
@ -396,6 +468,7 @@ class GettextVisitor(ast.NodeVisitor):
                msgctxt=msgctxt,
                locations={Location(self.filename, lineno)},
                is_docstring=is_docstring,
                comments=comments,
            )
    @staticmethod
@ -435,6 +508,10 @@ def write_pot_file(messages, options, fp):
    for key, locations in sorted_keys:
        msg = messages[key]
        for comment in msg.comments:
            print(f'#. {comment}', file=fp)
        if options.writelocations:
            # location comments are different b/w Solaris and GNU:
            if options.locationstyle == options.SOLARIS:
@ -473,9 +550,9 @@ def main():
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
-            'ad:DEhk:Kno:p:S:Vvw:x:X:',
+            'ac::d:DEhk:Kno:p:S:Vvw:x:X:',
-            ['extract-all', 'default-domain=', 'escape', 'help',
+            ['extract-all', 'add-comments=?', 'default-domain=', 'escape',
-             'keyword=', 'no-default-keywords',
+             'help', 'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
@ -501,6 +578,7 @@ def main():
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}
        comment_tags = set()
    options = Options()
    locations = {'gnu' : options.GNU,
@ -513,6 +591,8 @@ def main():
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-c', '--add-comments'):
            options.comment_tags.add(arg)
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
@ -558,6 +638,8 @@ def main():
            finally:
                fp.close()
    options.comment_tags = tuple(options.comment_tags)
    # calculate escapes
    make_escapes(not options.escape)
@ -600,12 +682,7 @@ def main():
            with open(filename, 'rb') as fp:
                source = fp.read()
-        try:
+        visitor.visit_file(source, filename)
            module_tree = ast.parse(source)
        except SyntaxError:
            continue
        visitor.visit_file(module_tree, filename)
    # write the output
    if options.outfile == '-':
		`@ -0,0 +1 @@`
							Add support for translator comments in :program:`pygettext.py`.