|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# for selftesting: use the (historical) 'fintl' C gettext module when it is
# available, otherwise fall back to an identity function so that _() can be
# used unconditionally throughout this file.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s
|
|
|
# The module docstring doubles as the --help text: usage() prints it after
# %-formatting it against globals() (which substitutes %(DEFAULTKEYWORDS)s).
# It is wrapped in _() so the help text itself is translatable.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] https://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
|
|
|
import os |
|
import importlib.machinery |
|
import importlib.util |
|
import sys |
|
import glob |
|
import time |
|
import getopt |
|
import ast |
|
import token |
|
import tokenize |
|
|
|
__version__ = '1.5'

# Keywords searched for by default; extended with -k/--keyword and cleared
# by -K/--no-default-keywords in main().
default_keywords = ['_']
# Interpolated into the module docstring via %(DEFAULTKEYWORDS)s in usage().
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Joiner for adjacent string literals collected inside a single _() call.
EMPTYSTRING = ''
|
|
|
|
|
|
|
|
|
|
|
# The normal pot-file header.  msgmerge and Emacs's po-mode work better if
# it's there.  The %(...)s slots are filled in by TokenEater.write(); the
# doubled backslashes produce literal \n sequences in the .pot output.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <[email protected]>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
|
|
|
|
|
def usage(code, msg=''):
    """Print the formatted module docstring (the help text) to stderr,
    optionally followed by an error message, then exit with `code`.
    """
    err = sys.stderr
    # %-format __doc__ against globals() so %(DEFAULTKEYWORDS)s is filled in.
    print(__doc__ % globals(), file=err)
    if msg:
        print(msg, file=err)
    sys.exit(code)
|
|
|
|
|
|
|
def make_escapes(pass_nonascii):
    """Build the module-level `escapes` table and pick the `escape` function.

    With pass_nonascii true, only code points 0-127 get table entries and
    non-ASCII characters are passed through untouched (escape_ascii);
    otherwise all 256 byte values are escaped (escape_nonascii).
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ASCII characters to pass through so that e.g. 'msgid
        # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        limit = 128
        escape = escape_ascii
    else:
        limit = 256
        escape = escape_nonascii
    # Default every entry to an octal escape, then overwrite the printable
    # ASCII range with the characters themselves.
    escapes = [r"\%03o" % code for code in range(limit)]
    for code in range(32, 127):
        escapes[code] = chr(code)
    # Finally, the handful of characters that need symbolic escapes.
    for ch, replacement in (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    ):
        escapes[ord(ch)] = replacement
|
|
|
|
|
def escape_ascii(s, encoding):
    """Escape chars below 128 via the `escapes` table; pass the rest through.

    `encoding` is unused here but kept so both escape functions share one
    signature (see make_escapes / escape_nonascii).
    """
    pieces = []
    for ch in s:
        code = ord(ch)
        pieces.append(escapes[code] if code < 128 else ch)
    return ''.join(pieces)
|
|
|
def escape_nonascii(s, encoding):
    """Encode s with `encoding` and map every resulting byte through the
    256-entry `escapes` table built by make_escapes(False)."""
    encoded = s.encode(encoding)
    return ''.join(map(escapes.__getitem__, encoded))
|
|
|
|
|
def is_literal_string(s):
    """Return True if token text `s` is a plain string literal, optionally
    prefixed by r/R/u/U.  Other prefixes (f, b, rb, ...) are rejected."""
    if s[0] in '\'"':
        return True
    return s[0] in 'rRuU' and s[1] in '\'"'
|
|
|
|
|
def safe_eval(s):
    """Evaluate token text `s` (a string literal) with builtins disabled, so
    unsafe constructs in the scanned source cannot execute code here."""
    restricted_globals = {'__builtins__': {}}
    no_locals = {}
    return eval(s, restricted_globals, no_locals)
|
|
|
|
|
def normalize(s, encoding):
    """Convert message `s` into the quoted, escaped form used after `msgid`
    in a .pot file (multi-line messages become one quoted chunk per line)."""
    lines = s.split('\n')
    if len(lines) == 1:
        # Single-line message: one quoted, escaped chunk.
        return '"' + escape(s, encoding) + '"'
    # Multi-line message: a trailing '\n' yields an empty final piece; drop
    # it and fold the newline back onto the (new) last piece.
    if not lines[-1]:
        del lines[-1]
        lines[-1] += '\n'
    escaped = [escape(piece, encoding) for piece in lines]
    # Empty first line, then each piece quoted with an escaped newline.
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
|
|
|
|
|
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'.

    Note: the parameter names shadow the builtins `str` and `set`; they are
    kept unchanged for backward compatibility with existing callers.
    """
    # any() short-circuits on the first hit instead of materializing a full
    # list of membership results as the original `1 in [...]` form did.
    return any(c in str for c in set)
|
|
|
|
|
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Returns [] when the name cannot be resolved to any existing file.
    """
    if not os.path.exists(name):
        # check for glob chars (equivalent to containsAny(name, "*?[]"),
        # inlined here so this function has no sibling dependencies)
        if any(c in name for c in "*?[]"):
            files = glob.glob(name)
            result = []
            for file in files:
                result.extend(getFilesForName(file))
            return result

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            # ImportError covers ModuleNotFoundError; ValueError is raised
            # for malformed (e.g. relative) module names.
            spec = None
        # Bug fix: find_spec() returns None (instead of raising) for a
        # missing top-level module; the original code then crashed with
        # AttributeError on spec.origin.  Namespace packages also have
        # origin None, which correctly falls through to the [] return.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        result = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            result.extend(
                os.path.join(root, file) for file in files
                if os.path.splitext(file)[1] == _py_ext
            )
        return result
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
|
|
|
|
|
class TokenEater:
    """Token-stream consumer that collects translatable strings.

    Instances are fed one token at a time through __call__ (driven by
    tokenize.tokenize() in main()) and run a small state machine:
    __waiting scans for keywords, docstring openers, and f-strings;
    __suiteseen/__suitedocstring capture class/def docstrings; and
    __keywordseen/__openseen capture the string argument(s) of keyword
    calls such as _('...').  Messages accumulate as
    {msgid: {(filename, lineno): isdocstring}} and are emitted in .pot
    format by write().
    """

    def __init__(self, options):
        self.__options = options        # parsed command-line Options object
        self.__messages = {}            # msgid -> {(file, lineno): isdocstring}
        self.__state = self.__waiting   # current state-machine handler
        self.__data = []                # string pieces inside the current _() call
        self.__lineno = -1              # line where the current _() call opened
        self.__freshmodule = 1          # still before the module docstring?
        self.__curfile = None           # filename currently being scanned
        self.__enclosurecount = 0       # bracket depth inside a def/class header

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch the token to the current state handler.  Only the token
        # type, its text, and its starting line number (stup[0]) are needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extraction, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
                    # ENCODING, comments and blank lines may legally precede
                    # the module docstring, so keep waiting for it.
                    return
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
            # tokenize delivers an f-string as one STRING token; parse it
            # and extract gettext calls found in its replacement fields.
            maybe_fstring = ast.parse(tstring, mode='eval').body
            if not isinstance(maybe_fstring, ast.JoinedStr):
                return
            for value in filter(lambda node: isinstance(node, ast.FormattedValue),
                                maybe_fstring.values):
                for call in filter(lambda node: isinstance(node, ast.Call),
                                   ast.walk(value)):
                    func = call.func
                    if isinstance(func, ast.Name):
                        func_name = func.id
                    elif isinstance(func, ast.Attribute):
                        func_name = func.attr
                    else:
                        continue

                    if func_name not in opts.keywords:
                        continue
                    if len(call.args) != 1:
                        # only calls with a single positional argument are
                        # extractable; warn and skip anything else
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected amount of'
                            ' positional arguments in gettext call: %(source_segment)s'
                        ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    if call.keywords:
                        # keyword arguments are not supported in gettext calls
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                            ' in gettext call: %(source_segment)s'
                        ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    arg = call.args[0]
                    if not isinstance(arg, ast.Constant):
                        # only a literal constant argument can be extracted
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                            ' in gettext call: %(source_segment)s'
                        ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    if isinstance(arg.value, str):
                        self.__addentry(arg.value, lineno)

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon that ends the
        # class/def header
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        # a keyword was just seen; only an immediately following open paren
        # starts an actual gettext call
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
            ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
            }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # record one occurrence of msg at (current file, lineno), unless the
        # message was listed in the -x/--exclude-file
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        # begin scanning a new input file: remember its name and re-arm the
        # module-docstring detection
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write all collected messages to fp in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)

        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse options, scan the input files with a
    TokenEater, and write the resulting .pot file."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # NOTE: currently has no effect (never consulted)
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # each line of the file names one input file to skip for
            # docstring extraction (the trailing newline is stripped)
            fp = open(arg)
            try:
                while 1:
                    line = fp.readline()
                    if not line:
                        break
                    options.nodocstrings[line[:-1]] = 1
            finally:
                fp.close()

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
|
|
|
|
|
if __name__ == '__main__':
    main()
    # some more test strings: when pygettext is run on itself these exercise
    # the extractor; this one deliberately creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
|
|