Speed up compiling translations

Avoids calling a separate msgfmt binary for every .po file, since there
are almost 4000 of them. Instead, use msgfmt.py, driving it in batches over stdin.
Kovid Goyal 2019-12-06 09:38:15 +05:30
parent 2865326de3
commit 6811bb0cf7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 77 additions and 50 deletions
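
In outline, the new scheme is: the build splits the .po files into batches, starts one msgfmt.py worker per batch, writes the batch to the worker as JSON on stdin, and reads a JSON list of (translated, untranslated) counts back on stdout. A minimal sketch of one such round trip (not part of the commit; the file names and the location of msgfmt.py are assumptions):

import json
import subprocess
import sys

jobs = [('de.po', 'de.mo'), ('fr.po', 'fr.mo')]  # hypothetical (source, destination) pairs
p = subprocess.Popen([sys.executable, 'msgfmt.py', 'STDIN'],  # path to msgfmt.py assumed
                     stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = p.communicate(json.dumps(jobs).encode('utf-8'))
stats = json.loads(out)  # one [translated, untranslated] pair per input file, in input order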


@@ -5,45 +5,15 @@
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import subprocess, os
import subprocess, os, itertools, json, sys
from multiprocessing.dummy import Pool
from threading import Thread
from functools import partial
from contextlib import closing
from setup import iswindows
from polyglot.builtins import unicode_type
from polyglot.builtins import unicode_type, as_bytes
if iswindows:
    from ctypes import windll, Structure, POINTER, c_size_t
    from ctypes.wintypes import WORD, DWORD, LPVOID

    class SYSTEM_INFO(Structure):
        _fields_ = [
            ("wProcessorArchitecture", WORD),
            ("wReserved", WORD),
            ("dwPageSize", DWORD),
            ("lpMinimumApplicationAddress", LPVOID),
            ("lpMaximumApplicationAddress", LPVOID),
            ("dwActiveProcessorMask", c_size_t),
            ("dwNumberOfProcessors", DWORD),
            ("dwProcessorType", DWORD),
            ("dwAllocationGranularity", DWORD),
            ("wProcessorLevel", WORD),
            ("wProcessorRevision", WORD)]

    gsi = windll.kernel32.GetSystemInfo
    gsi.argtypes = [POINTER(SYSTEM_INFO)]
    gsi.restype = None
    si = SYSTEM_INFO()
    gsi(si)
    cpu_count = si.dwNumberOfProcessors
else:
    from multiprocessing import cpu_count
    try:
        cpu_count = cpu_count()
    except NotImplementedError:
        cpu_count = 1
cpu_count = min(16, max(1, cpu_count))
cpu_count = min(16, max(1, os.cpu_count()))
def run_worker(job, decorate=True):
@@ -95,3 +65,44 @@ def parallel_check_output(jobs, log):
                log(stderr)
                raise SystemExit(1)
            yield stdout
def get_tasks(it, size):
    it = iter(it)
    while 1:
        x = tuple(itertools.islice(it, size))
        if not x:
            return
        yield x


def batched_parallel_jobs(cmd, jobs, cwd=None):
    chunksize, extra = divmod(len(jobs), cpu_count)
    if extra:
        chunksize += 1
    workers = []

    def get_output(p):
        p.output = p.communicate(as_bytes(json.dumps(p.jobs_batch)))

    for batch in get_tasks(jobs, chunksize):
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
        p.jobs_batch = batch
        p.output_thread = t = Thread(target=get_output, args=(p,))
        t.daemon = True
        t.start()
        workers.append(p)

    failed = False
    ans = []
    for p in workers:
        p.output_thread.join()
        if p.wait() != 0:
            sys.stderr.buffer.write(p.output[1])
            sys.stderr.buffer.flush()
            failed = True
        else:
            ans.extend(json.loads(p.output[0]))
    if failed:
        raise SystemExit('Worker process failed')
    return ans
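
For a sense of scale, the batch sizing above works out roughly as follows (the numbers are illustrative, taken from the commit message and the cpu_count cap):

jobs = 4000                                 # roughly the number of .po files
cpu_count = 16                              # cap applied by min(16, max(1, os.cpu_count()))
chunksize, extra = divmod(jobs, cpu_count)  # (250, 0)
if extra:
    chunksize += 1                          # round up so the remainder is not dropped
# get_tasks() then yields 16 batches of ~250 files each,
# i.e. 16 msgfmt.py processes instead of ~4000.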


@@ -12,7 +12,7 @@ from locale import normalize as normalize_locale
from functools import partial
from setup import Command, __appname__, __version__, require_git_master, build_cache_dir, edit_file, dump_json
from setup.parallel_build import parallel_check_output
from setup.parallel_build import batched_parallel_jobs
from polyglot.builtins import codepoint_to_chr, iteritems, range
is_ci = os.environ.get('CI', '').lower() == 'true'
@@ -320,8 +320,7 @@ class Translations(POT): # {{{
        self.compile_changelog_translations()

    def compile_group(self, files, handle_stats=None, file_ok=None, action_per_file=None):
        from calibre.constants import islinux
        jobs, ok_files = [], []
        ok_files = []
        hashmap = {}

        def stats_cache(src, data=None):
@@ -349,20 +348,21 @@
            else:
                if file_ok is None or file_ok(data, src):
                    # self.info('\t' + os.path.relpath(src, self.j(self.d(self.SRC), 'translations')))
                    if islinux:
                        msgfmt = ['msgfmt']
                    else:
                        msgfmt = [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py')]
                    jobs.append(msgfmt + ['--statistics', '-o', dest, src])
                    ok_files.append((src, dest))
                    hashmap[src] = current_hash
            if action_per_file is not None:
                action_per_file(src)

        self.info(f'\tCompiling {len(jobs)} files')
        for (src, dest), line in zip(ok_files, parallel_check_output(jobs, self.info)):
        self.info(f'\tCompiling {len(ok_files)} files')
        items = []
        results = batched_parallel_jobs(
            [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py'), 'STDIN'],
            ok_files)
        for (src, dest), nums in zip(ok_files, results):
            items.append((src, dest, nums))

        for (src, dest, nums) in items:
            self.write_cache(open(dest, 'rb').read(), hashmap[src], src)
            nums = tuple(map(int, re.findall(r'\d+', line)))
            stats_cache(src, nums)
            if handle_stats is not None:
                handle_stats(src, nums)
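
A note on ordering: batched_parallel_jobs() dispatches batches in the order get_tasks() yields them and extends the result list worker by worker, so the returned stats line up one-to-one with ok_files, and the zip() above simply re-attaches each (translated, untranslated) pair to its (src, dest). A toy illustration with made-up numbers:

ok_files = [('de.po', 'de.mo'), ('fr.po', 'fr.mo')]  # hypothetical inputs
results = [[1203, 17], [998, 41]]                    # what the workers might send back
for (src, dest), nums in zip(ok_files, results):
    print(f'{src} -> {dest}: {nums[0]} translated, {nums[1]} untranslated')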


@@ -50,7 +50,6 @@ def usage(code, msg=''):
def add(ctxt, id, str, fuzzy):
    "Add a non-fuzzy translation to the dictionary."
    global MESSAGES
    if not fuzzy and str:
        if id:
            STATS['translated'] += 1
@@ -65,7 +64,6 @@ def add(ctxt, id, str, fuzzy):
def generate():
    "Return the generated output."
    global MESSAGES
    # the keys are sorted in the .mo file
    keys = sorted(MESSAGES.keys())
    offsets = []
@@ -236,9 +234,28 @@ def make(filename, outfile):
        print(msg, file=sys.stderr)


def make_with_stats(filename, outfile):
    MESSAGES.clear()
    STATS['translated'] = STATS['untranslated'] = 0
    make(filename, outfile)
    return STATS['translated'], STATS['untranslated']


def run_batch(pairs):
    for (filename, outfile) in pairs:
        yield make_with_stats(filename, outfile)


def main():
    args = sys.argv[1:]
    if args == ['STDIN']:
        import json
        results = tuple(run_batch(json.loads(sys.stdin.buffer.read())))
        sys.stdout.buffer.write(json.dumps(results).encode('utf-8'))
        sys.stdout.close()
        return
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVso:',
        opts, args = getopt.getopt(args, 'hVso:',
                                   ['help', 'version', 'statistics', 'output-file='])
    except getopt.error as msg:
        usage(1, msg)
@@ -263,8 +280,7 @@ def main():
        return

    for filename in args:
        STATS['translated'] = STATS['untranslated'] = 0
        make(filename, outfile)
        translated, untranslated = make_with_stats(filename, outfile)
        if output_stats:
            print(STATS['translated'], 'translated messages,', STATS['untranslated'], 'untranslated messages.')
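
One detail worth noting in the worker: MESSAGES and STATS are module-level, so make_with_stats() has to reset them before every make() call now that a single process compiles many catalogs in a row (the dropped `global MESSAGES` statements were never needed, since the dict is only mutated in place, never rebound). A self-contained toy showing why the reset matters; the names below are stand-ins, not calibre code:

MESSAGES = {}
STATS = {'translated': 0, 'untranslated': 0}

def compile_one(entries):
    # stand-in for make(): fills MESSAGES and bumps STATS as a side effect
    for msgid, msgstr in entries:
        MESSAGES[msgid] = msgstr
        STATS['translated' if msgstr else 'untranslated'] += 1

def compile_with_stats(entries):
    MESSAGES.clear()                                 # without these resets, catalog N
    STATS['translated'] = STATS['untranslated'] = 0  # would inherit catalog N-1's state
    compile_one(entries)
    return STATS['translated'], STATS['untranslated']

print([compile_with_stats(e) for e in ([('a', 'x')], [('b', ''), ('c', 'y')])])
# -> [(1, 0), (1, 1)]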