From e41ec8e18b078024b02a742272e675ae39778536 Mon Sep 17 00:00:00 2001 From: "Tomas R." Date: Tue, 4 Feb 2025 23:59:23 +0100 Subject: [PATCH] gh-104400: pygettext: Prepare to replace TokenEater with a NodeVisitor (#129672) * Update the module docstring * Move ``key_for`` inside the class * Move ``write_pot_file`` outside the class --- Tools/i18n/pygettext.py | 133 ++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 66 deletions(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 81d9fdbb360..d8a0e379ab8 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -7,15 +7,9 @@ the programming language and can be used from within Python programs. Martin von Loewis' work[1] helps considerably in this regard. -There's one problem though; xgettext is the program that scans source code -looking for message strings, but it groks only C (or C++). Python -introduces a few wrinkles, such as dual quoting characters, triple quoted -strings, and raw strings. xgettext understands none of this. - -Enter pygettext, which uses Python's standard tokenize module to scan -Python source code, generating .pot files identical to what GNU xgettext[2] -generates for C and C++ code. From there, the standard GNU tools can be -used. +pygettext uses Python's standard tokenize module to scan Python source +code, generating .pot files identical to what GNU xgettext[2] generates +for C and C++ code. From there, the standard GNU tools can be used. A word about marking Python strings as candidates for translation. GNU xgettext recognizes the following keywords: gettext, dgettext, dcgettext, @@ -41,6 +35,9 @@ option arguments is broken, and in these cases, pygettext just defines additional switches. +NOTE: The public interface of pygettext is limited to the command-line +interface only. The internal API is subject to change without notice. + Usage: pygettext [options] inputfile ... Options: @@ -328,12 +325,6 @@ def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=Fals self.is_docstring |= is_docstring -def key_for(msgid, msgctxt=None): - if msgctxt is not None: - return (msgctxt, msgid) - return msgid - - class TokenEater: def __init__(self, options): self.__options = options @@ -354,6 +345,10 @@ def __call__(self, ttype, tstring, stup, etup, line): ## file=sys.stderr) self.__state(ttype, tstring, stup[0]) + @property + def messages(self): + return self.__messages + def __waiting(self, ttype, tstring, lineno): opts = self.__options # Do docstring extractions, if enabled @@ -513,7 +508,7 @@ def __addentry(self, msg, lineno=None, *, is_docstring=False): lineno = self.__lineno msgctxt = msg.get('msgctxt') msgid_plural = msg.get('msgid_plural') - key = key_for(msgid, msgctxt) + key = self._key_for(msgid, msgctxt) if key in self.__messages: self.__messages[key].add_location( self.__curfile, @@ -530,6 +525,12 @@ def __addentry(self, msg, lineno=None, *, is_docstring=False): is_docstring=is_docstring, ) + @staticmethod + def _key_for(msgid, msgctxt=None): + if msgctxt is not None: + return (msgctxt, msgid) + return msgid + def warn_unexpected_token(self, token): print(( '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' @@ -543,58 +544,58 @@ def set_filename(self, filename): self.__curfile = filename self.__freshmodule = 1 - def write(self, fp): - options = self.__options - timestamp = time.strftime('%Y-%m-%d %H:%M%z') - encoding = fp.encoding if fp.encoding else 'UTF-8' - print(pot_header % {'time': timestamp, 'version': __version__, - 'charset': encoding, - 'encoding': '8bit'}, file=fp) - # Sort locations within each message by filename and lineno - sorted_keys = [ - (key, sorted(msg.locations)) - for key, msg in self.__messages.items() - ] - # Sort messages by locations - # For example, a message with locations [('test.py', 1), ('test.py', 2)] will - # appear before a message with locations [('test.py', 1), ('test.py', 3)] - sorted_keys.sort(key=itemgetter(1)) +def write_pot_file(messages, options, fp): + timestamp = time.strftime('%Y-%m-%d %H:%M%z') + encoding = fp.encoding if fp.encoding else 'UTF-8' + print(pot_header % {'time': timestamp, 'version': __version__, + 'charset': encoding, + 'encoding': '8bit'}, file=fp) - for key, locations in sorted_keys: - msg = self.__messages[key] - if options.writelocations: - # location comments are different b/w Solaris and GNU: - if options.locationstyle == options.SOLARIS: - for location in locations: - print(f'# File: {location.filename}, line: {location.lineno}', file=fp) - elif options.locationstyle == options.GNU: - # fit as many locations on one line, as long as the - # resulting line length doesn't exceed 'options.width' - locline = '#:' - for location in locations: - s = f' {location.filename}:{location.lineno}' - if len(locline) + len(s) <= options.width: - locline = locline + s - else: - print(locline, file=fp) - locline = f'#:{s}' - if len(locline) > 2: + # Sort locations within each message by filename and lineno + sorted_keys = [ + (key, sorted(msg.locations)) + for key, msg in messages.items() + ] + # Sort messages by locations + # For example, a message with locations [('test.py', 1), ('test.py', 2)] will + # appear before a message with locations [('test.py', 1), ('test.py', 3)] + sorted_keys.sort(key=itemgetter(1)) + + for key, locations in sorted_keys: + msg = messages[key] + if options.writelocations: + # location comments are different b/w Solaris and GNU: + if options.locationstyle == options.SOLARIS: + for location in locations: + print(f'# File: {location.filename}, line: {location.lineno}', file=fp) + elif options.locationstyle == options.GNU: + # fit as many locations on one line, as long as the + # resulting line length doesn't exceed 'options.width' + locline = '#:' + for location in locations: + s = f' {location.filename}:{location.lineno}' + if len(locline) + len(s) <= options.width: + locline = locline + s + else: print(locline, file=fp) - if msg.is_docstring: - # If the entry was gleaned out of a docstring, then add a - # comment stating so. This is to aid translators who may wish - # to skip translating some unimportant docstrings. - print('#, docstring', file=fp) - if msg.msgctxt is not None: - print('msgctxt', normalize(msg.msgctxt, encoding), file=fp) - print('msgid', normalize(msg.msgid, encoding), file=fp) - if msg.msgid_plural is not None: - print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp) - print('msgstr[0] ""', file=fp) - print('msgstr[1] ""\n', file=fp) - else: - print('msgstr ""\n', file=fp) + locline = f'#:{s}' + if len(locline) > 2: + print(locline, file=fp) + if msg.is_docstring: + # If the entry was gleaned out of a docstring, then add a + # comment stating so. This is to aid translators who may wish + # to skip translating some unimportant docstrings. + print('#, docstring', file=fp) + if msg.msgctxt is not None: + print('msgctxt', normalize(msg.msgctxt, encoding), file=fp) + print('msgid', normalize(msg.msgid, encoding), file=fp) + if msg.msgid_plural is not None: + print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp) + print('msgstr[0] ""', file=fp) + print('msgstr[1] ""\n', file=fp) + else: + print('msgstr ""\n', file=fp) def main(): @@ -752,7 +753,7 @@ class Options: fp = open(options.outfile, 'w') closep = 1 try: - eater.write(fp) + write_pot_file(eater.messages, options, fp) finally: if closep: fp.close()