From a3bf706825273d2b3eb6916674b078dd0546dca3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 13 Jun 2019 10:36:53 +0530 Subject: [PATCH] Fix msgfmt.py broken during unicode porting Also merge in changes from upstream version --- src/calibre/translations/msgfmt.py | 119 ++++++++++++++++++----------- 1 file changed, 75 insertions(+), 44 deletions(-) diff --git a/src/calibre/translations/msgfmt.py b/src/calibre/translations/msgfmt.py index b840241076..a5a2c83019 100644 --- a/src/calibre/translations/msgfmt.py +++ b/src/calibre/translations/msgfmt.py @@ -1,12 +1,15 @@ -#!/usr/bin/env python2 -# Written by Martin v. Loewis +#! /usr/bin/env python +# vim:fileencoding=utf-8 +# Written by Martin v. Löwis + from __future__ import absolute_import, division, print_function, unicode_literals """Generate binary message catalog from textual translation description. This program converts a textual Uniforum-style message catalog (.po file) into a binary GNU catalog (.mo file). This is essentially the same function as the -GNU msgfmt program, however, it is a simpler implementation. +GNU msgfmt program, however, it is a simpler implementation. Currently it +does not handle plural forms but it does handle message contexts. Usage: msgfmt.py [OPTIONS] filename.po @@ -24,15 +27,16 @@ Options: --version Display version information and exit. """ -from __future__ import print_function -import sys import os +import sys +import ast import getopt import struct import array +from email.parser import HeaderParser -__version__ = "1.1" +__version__ = "1.2" MESSAGES = {} STATS = {'translated': 0, 'untranslated': 0} @@ -45,13 +49,16 @@ def usage(code, msg=''): sys.exit(code) -def add(id, s, fuzzy): +def add(ctxt, id, str, fuzzy): "Add a non-fuzzy translation to the dictionary." global MESSAGES - if not fuzzy and s: - MESSAGES[id] = s + if not fuzzy and str: if id: STATS['translated'] += 1 + if ctxt is None: + MESSAGES[id] = str + else: + MESSAGES[b"%b\x04%b" % (ctxt, id)] = str else: if id: STATS['untranslated'] += 1 @@ -60,17 +67,16 @@ def add(id, s, fuzzy): def generate(): "Return the generated output." global MESSAGES - keys = list(MESSAGES) # the keys are sorted in the .mo file - keys.sort() + keys = sorted(MESSAGES.keys()) offsets = [] - ids = strs = '' + ids = strs = b'' for id in keys: # For each string, we need size and file offset. Each string is NUL # terminated; the NUL does not count into the size. offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) - ids += id + '\0' - strs += MESSAGES[id] + '\0' + ids += id + b'\0' + strs += MESSAGES[id] + b'\0' output = '' # The header is 7 32-bit unsigned integers. We don't use hash tables, so # the keys start right after the index tables. @@ -93,15 +99,19 @@ def generate(): 7*4, # start of key index 7*4+len(keys)*8, # start of value index 0, 0) # size and offset of hash table - output += array.array("i", offsets).tostring() - output += ids.encode('utf-8') - output += strs.encode('utf-8') + try: + output += array.array("i", offsets).tobytes() + except AttributeError: + output += array.array("i", offsets).tostring() + output += ids + output += strs return output def make(filename, outfile): ID = 1 STR = 2 + CTXT = 3 # Compute .mo name from .po name and arguments if filename.endswith('.po'): @@ -112,24 +122,29 @@ def make(filename, outfile): outfile = os.path.splitext(infile)[0] + '.mo' try: - lines = open(infile).readlines() + with open(infile, 'rb') as f: + lines = f.readlines() except IOError as msg: print(msg, file=sys.stderr) sys.exit(1) - section = None + section = msgctxt = None fuzzy = 0 + msgid = msgstr = b'' + + # Start off assuming Latin-1, so everything decodes without failure, + # until we know the exact encoding + encoding = 'latin-1' # Parse the catalog lno = 0 - msgid = msgstr = '' for l in lines: - l = l.decode('utf-8') + l = l.decode(encoding) lno += 1 # If we get a comment line after a msgstr, this is a new entry if l[0] == '#' and section == STR: - add(msgid, msgstr, fuzzy) - section = None + add(msgctxt, msgid, msgstr, fuzzy) + section = msgctxt = None fuzzy = 0 # Record a fuzzy mark if l[:2] == '#,' and 'fuzzy' in l: @@ -137,50 +152,66 @@ def make(filename, outfile): # Skip comments if l[0] == '#': continue - # Now we are in a msgid section, output previous section - if l.startswith('msgid') and not l.startswith('msgid_plural'): + # Now we are in a msgid or msgctxt section, output previous section + if l.startswith('msgctxt'): if section == STR: - add(msgid, msgstr, fuzzy) + add(msgctxt, msgid, msgstr, fuzzy) + section = CTXT + l = l[7:] + msgctxt = b'' + elif l.startswith('msgid') and not l.startswith('msgid_plural'): + if section == STR: + add(msgctxt, msgid, msgstr, fuzzy) + if not msgid: + # See whether there is an encoding declaration + p = HeaderParser() + charset = p.parsestr(msgstr.decode(encoding)).get_content_charset() + if charset: + encoding = charset section = ID l = l[5:] - msgid = msgstr = '' + msgid = msgstr = b'' is_plural = False # This is a message with plural forms elif l.startswith('msgid_plural'): if section != ID: - print('msgid_plural not preceeded by msgid on %s:%d' % - (infile, lno), file=sys.stderr) + print('msgid_plural not preceded by msgid on %s:%d' % (infile, lno), + file=sys.stderr) sys.exit(1) l = l[12:] - msgid += '\0' # separator of singular and plural + msgid += b'\0' # separator of singular and plural is_plural = True # Now we are in a msgstr section elif l.startswith('msgstr'): section = STR if l.startswith('msgstr['): if not is_plural: - print('plural without msgid_plural on %s:%d' % - (infile, lno), file=sys.stderr) + print('plural without msgid_plural on %s:%d' % (infile, lno), + file=sys.stderr) sys.exit(1) l = l.split(']', 1)[1] if msgstr: - msgstr += '\0' # Separator of the various plural forms + msgstr += b'\0' # Separator of the various plural forms else: if is_plural: - print('indexed msgstr required for plural on %s:%d' % - (infile, lno), file=sys.stderr) + print('indexed msgstr required for plural on %s:%d' % (infile, lno), + file=sys.stderr) sys.exit(1) l = l[6:] # Skip empty lines l = l.strip() if not l: continue - # XXX: Does this always follow Python escape semantics? - l = eval(l) - if section == ID: - msgid += l + l = ast.literal_eval(l) + lb = l + if not isinstance(lb, bytes): + lb = lb.encode(encoding) + if section == CTXT: + msgctxt += lb + elif section == ID: + msgid += lb elif section == STR: - msgstr += l + msgstr += lb else: print('Syntax error on %s:%d' % (infile, lno), 'before:', file=sys.stderr) @@ -188,16 +219,16 @@ def make(filename, outfile): sys.exit(1) # Add last entry if section == STR: - add(msgid, msgstr, fuzzy) + add(msgctxt, msgid, msgstr, fuzzy) # Compute output output = generate() try: - outfile.write(output) - except AttributeError: - with open(outfile, 'wb') as f: + with open(outfile,"wb") as f: f.write(output) + except IOError as msg: + print(msg, file=sys.stderr) def main():