diff --git a/setup/iso_codes.py b/setup/iso_codes.py index 633173ab15..3a3bfecd12 100644 --- a/setup/iso_codes.py +++ b/setup/iso_codes.py @@ -1,10 +1,14 @@ #!/usr/bin/env python # License: GPLv3 Copyright: 2023, Kovid Goyal -from setup import download_securely +import fnmatch +import os +import shutil, time import zipfile from io import BytesIO +from setup import download_securely + class ISOData: URL = 'https://salsa.debian.org/iso-codes-team/iso-codes/-/archive/main/iso-codes-main.zip' @@ -23,5 +27,16 @@ class ISOData: with zf.open(f'iso-codes-main/data/{name}') as f: return f.read() + def extract_po_files(self, name: str, output_dir: str) -> None: + name = name.split('.', 1)[0] + pat = f'iso-codes-main/{name}/*.po' + with zipfile.ZipFile(self.zip_data) as zf: + for name in fnmatch.filter(zf.namelist(), pat): + dest = os.path.join(output_dir, name.split('/')[-1]) + zi = zf.getinfo(name) + with zf.open(zi) as src, open(dest, 'wb') as d: + shutil.copyfileobj(src, d) + date_time = time.mktime(zi.date_time + (0, 0, -1)) + os.utime(dest, (date_time, date_time)) iso_data = ISOData() diff --git a/setup/translations.py b/setup/translations.py index 1c97877089..53e5adf4a9 100644 --- a/setup/translations.py +++ b/setup/translations.py @@ -91,32 +91,6 @@ class POT(Command): # {{{ return '\n'.join(ans) - def get_iso639_strings(self): - self.info('Generating translation template for iso639') - root = json.loads(iso_data.db_data('iso_639-3.json')) - entries = root['639-3'] - ans = [] - - def name_getter(x): - return x.get('inverted_name') or x.get('name') - - for x in sorted(entries, key=lambda x:name_getter(x).lower()): - name = name_getter(x) - if name: - ans.append(f'msgid "{name}"') - ans.append('msgstr ""') - ans.append('') - pot = self.pot_header() + '\n\n' + '\n'.join(ans) - dest = self.j(self.TRANSLATIONS, 'iso_639', 'iso_639_3.pot') - try: - os.makedirs(os.path.dirname(dest)) - except OSError: - pass - with open(dest, 'wb') as f: - f.write(pot.encode('utf-8')) - self.upload_pot(resource='iso639') - self.git(['add', dest]) - def get_content_server_strings(self): self.info('Generating translation template for content_server') from calibre import walk @@ -208,7 +182,6 @@ class POT(Command): # {{{ def run(self, opts): if not is_ci: require_git_master() - self.get_iso639_strings() if not is_ci: self.get_website_strings() self.get_content_server_strings() @@ -318,12 +291,12 @@ class Translations(POT): # {{{ self.compile_website_translations() self.compile_changelog_translations() - def compile_group(self, files, handle_stats=None, file_ok=None, action_per_file=None): + def compile_group(self, files, handle_stats=None, action_per_file=None, make_translated_strings_unique=False, keyfunc=lambda x: x): ok_files = [] hashmap = {} def stats_cache(src, data=None): - cname = self.cache_name(src) + '.stats.json' + cname = self.cache_name(keyfunc(src)) + '.stats.json' with open(self.j(self.cache_dir, cname), ('rb' if data is None else 'wb')) as f: if data is None: return json.loads(f.read()) @@ -336,42 +309,39 @@ class Translations(POT): # {{{ base = os.path.dirname(dest) if not os.path.exists(base): os.makedirs(base) - data, h = self.hash_and_data(src) + data, h = self.hash_and_data(src, keyfunc) current_hash = h.digest() - saved_hash, saved_data = self.read_cache(src) + saved_hash, saved_data = self.read_cache(keyfunc(src)) if current_hash == saved_hash: with open(dest, 'wb') as d: d.write(saved_data) if handle_stats is not None: handle_stats(src, stats_cache(src)) else: - if file_ok is None or file_ok(data, src): - # self.info('\t' + os.path.relpath(src, self.j(self.d(self.SRC), 'translations'))) - ok_files.append((src, dest)) - hashmap[src] = current_hash + ok_files.append((src, dest)) + hashmap[keyfunc(src)] = current_hash if action_per_file is not None: action_per_file(src) self.info(f'\tCompiling {len(ok_files)} files') items = [] results = batched_parallel_jobs( - [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py'), 'STDIN'], + [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py'), 'STDIN', 'uniqify' if make_translated_strings_unique else ' '], ok_files) - for (src, dest), nums in zip(ok_files, results): - items.append((src, dest, nums)) + for (src, dest), data in zip(ok_files, results): + items.append((src, dest, data)) - for (src, dest, nums) in items: - self.write_cache(open(dest, 'rb').read(), hashmap[src], src) - stats_cache(src, nums) + for (src, dest, data) in items: + self.write_cache(open(dest, 'rb').read(), hashmap[keyfunc(src)], keyfunc(src)) + stats_cache(src, data) if handle_stats is not None: - handle_stats(src, nums) + handle_stats(src, data) def compile_main_translations(self): l = {} lc_dataf = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lc_data.py') exec(compile(open(lc_dataf, 'rb').read(), lc_dataf, 'exec'), l, l) lcdata = {k:{k1:v1 for k1, v1 in v} for k, v in l['data']} - self.iso639_errors = [] self.info('Compiling main UI translation files...') fmap = {f:self.mo_file(f) for f in self.po_files()} files = [(f, fmap[f][1]) for f in self.po_files()] @@ -388,33 +358,41 @@ class Translations(POT): # {{{ stats = {} - def handle_stats(f, nums): - trans = nums[0] - total = trans if len(nums) == 1 else (trans + nums[1]) + def handle_stats(f, data): + trans, untrans = data['translated'], data['untranslated'] + total = trans + untrans locale = fmap[f][0] stats[locale] = min(1.0, float(trans)/total) self.compile_group(files, handle_stats=handle_stats, action_per_file=action_per_file) - self.info('Compiling ISO639 files...') + self.info('Compiling ISO639 files...') files = [] skip_iso = { - 'en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds', - 'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku', 'fr_CA', 'him', - 'jv', 'ka', 'fur', 'ber', 'my', 'fil', 'hy', 'ug'} - for f, (locale, dest) in iteritems(fmap): - iscpo = {'bn':'bn_IN', 'zh_HK':'zh_CN'}.get(locale, locale) - iso639 = self.j(self.TRANSLATIONS, 'iso_639', '%s.po'%iscpo) - if os.path.exists(iso639): - files.append((iso639, self.j(self.d(dest), 'iso639.mo'))) - elif locale not in skip_iso: - self.warn('No ISO 639 translations for locale:', locale) - self.compile_group(files, file_ok=self.check_iso639) + 'si', 'te', 'km', 'en_GB', 'en_AU', 'en_CA', 'yi', 'ku', 'my', 'uz@Latn', 'fil', 'hy', 'ltg', 'km_KH', 'km', + 'ur', 'ml', 'fo', 'ug', 'nds', 'jv' + } - if self.iso639_errors: - for err in self.iso639_errors: - print(err) - raise SystemExit(1) + def handle_stats(f, data): + if False and data['uniqified']: + print(f'{data["uniqified"]:3d} non-unique language name translations in {os.path.basename(f)}', file=sys.stderr) + + with tempfile.TemporaryDirectory() as tdir: + iso_data.extract_po_files('iso_639-3', tdir) + for f, (locale, dest) in iteritems(fmap): + iscpo = {'zh_HK':'zh_CN'}.get(locale, locale) + iso639 = self.j(tdir, '%s.po'%iscpo) + if os.path.exists(iso639): + files.append((iso639, self.j(self.d(dest), 'iso639.mo'))) + else: + iscpo = iscpo.partition('_')[0] + iso639 = self.j(tdir, '%s.po'%iscpo) + if os.path.exists(iso639): + files.append((iso639, self.j(self.d(dest), 'iso639.mo'))) + elif locale not in skip_iso: + self.warn('No ISO 639 translations for locale:', locale) + self.compile_group(files, make_translated_strings_unique=True, handle_stats=handle_stats, + keyfunc=lambda x: os.path.join(self.d(self.SRC), 'iso639', os.path.basename(x))) dest = self.stats base = self.d(dest) @@ -427,11 +405,11 @@ class Translations(POT): # {{{ with open(dest, 'wb') as f: f.write(msgpack_dumps(stats)) - def hash_and_data(self, f): + def hash_and_data(self, f, keyfunc=lambda x:x): with open(f, 'rb') as s: data = s.read() h = hashlib.sha1(data) - h.update(f.encode('utf-8')) + h.update(keyfunc(f).encode('utf-8')) return data, h def compile_content_server_translations(self): @@ -467,31 +445,6 @@ class Translations(POT): # {{{ zi.compress_type = ZIP_STORED if is_ci else ZIP_DEFLATED zf.writestr(zi, raw) - def check_iso639(self, raw, path): - from calibre.utils.localization import langnames_to_langcodes - rmap = {} - msgid = None - has_errors = False - if isinstance(raw, bytes): - raw = raw.decode('utf-8') - for match in re.finditer(r'^(msgid|msgstr)\s+"(.*?)"', raw, re.M): - if match.group(1) == 'msgid': - msgid = match.group(2) - else: - msgstr = match.group(2) - if not msgstr: - continue - omsgid = rmap.get(msgstr, None) - if omsgid is not None: - cm = langnames_to_langcodes([omsgid, msgid]) - if cm[msgid] and cm[omsgid] and cm[msgid] != cm[omsgid]: - has_errors = True - self.iso639_errors.append('In file {} the name {} is used as translation for both {} and {}'.format( - os.path.basename(path), msgstr, msgid, rmap[msgstr])) - # raise SystemExit(1) - rmap[msgstr] = msgid - return not has_errors - def freeze_locales(self): zf = self.DEST + '.zip' from calibre import CurrentDir @@ -518,10 +471,10 @@ class Translations(POT): # {{{ stats = {} done = [] - def handle_stats(src, nums): + def handle_stats(src, data): locale = fmap[src] - trans = nums[0] - total = trans if len(nums) == 1 else (trans + nums[1]) + trans, untrans = data['translated'], data['untranslated'] + total = trans + untrans stats[locale] = int(round(100 * trans / total)) with TemporaryDirectory() as tdir, ZipFile(self.j(srcbase, 'locales.zip'), 'w', ZIP_STORED) as zf: @@ -593,12 +546,11 @@ class Translations(POT): # {{{ mofile = self.j(dest, po.rpartition('.')[0] + '.mo') files.append((self.j(q, po), mofile)) - def handle_stats(src, nums): + def handle_stats(src, data): locale = self.b(self.d(src)) stats = all_stats[locale] - stats['translated'] += nums[0] - if len(nums) > 1: - stats['untranslated'] += nums[1] + stats['translated'] += data['translated'] + stats['untranslated'] += data['untranslated'] self.compile_group(files, handle_stats=handle_stats) for locale, stats in iteritems(all_stats): diff --git a/src/calibre/translations/msgfmt.py b/src/calibre/translations/msgfmt.py index 6422e75762..0f0571bc90 100644 --- a/src/calibre/translations/msgfmt.py +++ b/src/calibre/translations/msgfmt.py @@ -37,7 +37,9 @@ from email.parser import HeaderParser __version__ = "1.2" MESSAGES = {} -STATS = {'translated': 0, 'untranslated': 0} +STATS = {'translated': 0, 'untranslated': 0, 'uniqified': 0} +MAKE_UNIQUE = False +NON_UNIQUE = set() def usage(code, msg=''): @@ -53,6 +55,12 @@ def add(ctxt, msgid, msgstr, fuzzy): if msgid: STATS['translated'] += 1 if ctxt is None: + if msgstr in NON_UNIQUE: + STATS['uniqified'] += 1 + if MAKE_UNIQUE: + msgstr += b' (' + msgid + b')' + else: + NON_UNIQUE.add(msgstr) MESSAGES[msgid] = msgstr else: MESSAGES[b"%b\x04%b" % (ctxt, msgid)] = msgstr @@ -235,9 +243,10 @@ def make(filename, outfile): def make_with_stats(filename, outfile): MESSAGES.clear() - STATS['translated'] = STATS['untranslated'] = 0 + NON_UNIQUE.clear() + STATS['translated'] = STATS['untranslated'] = STATS['uniqified'] = 0 make(filename, outfile) - return STATS['translated'], STATS['untranslated'] + return STATS.copy() def run_batch(pairs): @@ -246,8 +255,10 @@ def run_batch(pairs): def main(): + global MAKE_UNIQUE args = sys.argv[1:] - if args == ['STDIN']: + if args[0] == 'STDIN': + MAKE_UNIQUE = args[1] == 'uniqify' import json results = tuple(run_batch(json.loads(sys.stdin.buffer.read()))) sys.stdout.buffer.write(json.dumps(results).encode('utf-8')) @@ -279,7 +290,7 @@ def main(): return for filename in args: - translated, untranslated = make_with_stats(filename, outfile) + make_with_stats(filename, outfile) if output_stats: print(STATS['translated'], 'translated messages,', STATS['untranslated'], 'untranslated messages.')