Speed up compiling translations

Avoids calling a separate msgfmt binary for every .po file, since there
are almost 4000 of them. Instead, use msgfmt.py, driving it in batches over stdin.
Kovid Goyal 2019-12-06 09:38:15 +05:30
parent 2865326de3
commit 6811bb0cf7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 77 additions and 50 deletions
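
In outline, the new scheme is: the build splits the .po files into batches, starts one msgfmt.py worker per batch, writes the batch to the worker as JSON on stdin, and reads a JSON list of (translated, untranslated) counts back on stdout. A minimal sketch of one such round trip (not part of the commit; the file names and the location of msgfmt.py are assumptions):

import json
import subprocess
import sys

jobs = [('de.po', 'de.mo'), ('fr.po', 'fr.mo')]  # hypothetical (source, destination) pairs
p = subprocess.Popen([sys.executable, 'msgfmt.py', 'STDIN'],  # path to msgfmt.py assumed
                     stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = p.communicate(json.dumps(jobs).encode('utf-8'))
stats = json.loads(out)  # one [translated, untranslated] pair per input file, in input order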


@@ -5,45 +5,15 @@
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import subprocess, os
import subprocess, os, itertools, json, sys
from multiprocessing.dummy import Pool
from threading import Thread
from functools import partial
from contextlib import closing
from setup import iswindows
from polyglot.builtins import unicode_type
from polyglot.builtins import unicode_type, as_bytes
if iswindows:
    from ctypes import windll, Structure, POINTER, c_size_t
    from ctypes.wintypes import WORD, DWORD, LPVOID

    class SYSTEM_INFO(Structure):
        _fields_ = [
            ("wProcessorArchitecture", WORD),
            ("wReserved", WORD),
            ("dwPageSize", DWORD),
            ("lpMinimumApplicationAddress", LPVOID),
            ("lpMaximumApplicationAddress", LPVOID),
            ("dwActiveProcessorMask", c_size_t),
            ("dwNumberOfProcessors", DWORD),
            ("dwProcessorType", DWORD),
            ("dwAllocationGranularity", DWORD),
            ("wProcessorLevel", WORD),
            ("wProcessorRevision", WORD)]

    gsi = windll.kernel32.GetSystemInfo
    gsi.argtypes = [POINTER(SYSTEM_INFO)]
    gsi.restype = None
    si = SYSTEM_INFO()
    gsi(si)
    cpu_count = si.dwNumberOfProcessors
else:
    from multiprocessing import cpu_count
    try:
        cpu_count = cpu_count()
    except NotImplementedError:
        cpu_count = 1
cpu_count = min(16, max(1, cpu_count))
cpu_count = min(16, max(1, os.cpu_count()))
def run_worker(job, decorate=True):
@@ -95,3 +65,44 @@ def parallel_check_output(jobs, log):
                log(stderr)
                raise SystemExit(1)
            yield stdout
def get_tasks(it, size):
    it = iter(it)
    while 1:
        x = tuple(itertools.islice(it, size))
        if not x:
            return
        yield x


def batched_parallel_jobs(cmd, jobs, cwd=None):
    chunksize, extra = divmod(len(jobs), cpu_count)
    if extra:
        chunksize += 1
    workers = []

    def get_output(p):
        p.output = p.communicate(as_bytes(json.dumps(p.jobs_batch)))

    for batch in get_tasks(jobs, chunksize):
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
        p.jobs_batch = batch
        p.output_thread = t = Thread(target=get_output, args=(p,))
        t.daemon = True
        t.start()
        workers.append(p)

    failed = False
    ans = []
    for p in workers:
        p.output_thread.join()
        if p.wait() != 0:
            sys.stderr.buffer.write(p.output[1])
            sys.stderr.buffer.flush()
            failed = True
        else:
            ans.extend(json.loads(p.output[0]))
    if failed:
        raise SystemExit('Worker process failed')
    return ans
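
For a sense of scale, the batch sizing above works out roughly as follows (the numbers are illustrative, taken from the commit message and the cpu_count cap):

jobs = 4000                                 # roughly the number of .po files
cpu_count = 16                              # cap applied by min(16, max(1, os.cpu_count()))
chunksize, extra = divmod(jobs, cpu_count)  # (250, 0)
if extra:
    chunksize += 1                          # round up so the remainder is not dropped
# get_tasks() then yields 16 batches of ~250 files each,
# i.e. 16 msgfmt.py processes instead of ~4000.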


@@ -12,7 +12,7 @@ from locale import normalize as normalize_locale
from functools import partial
from setup import Command, __appname__, __version__, require_git_master, build_cache_dir, edit_file, dump_json
from setup.parallel_build import parallel_check_output
from setup.parallel_build import batched_parallel_jobs
from polyglot.builtins import codepoint_to_chr, iteritems, range
is_ci = os.environ.get('CI', '').lower() == 'true'
@@ -320,8 +320,7 @@ class Translations(POT): # {{{
        self.compile_changelog_translations()

    def compile_group(self, files, handle_stats=None, file_ok=None, action_per_file=None):
        from calibre.constants import islinux
        jobs, ok_files = [], []
        ok_files = []
        hashmap = {}

        def stats_cache(src, data=None):
@@ -349,20 +348,21 @@
            else:
                if file_ok is None or file_ok(data, src):
                    # self.info('\t' + os.path.relpath(src, self.j(self.d(self.SRC), 'translations')))
                    if islinux:
                        msgfmt = ['msgfmt']
                    else:
                        msgfmt = [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py')]
                    jobs.append(msgfmt + ['--statistics', '-o', dest, src])
                    ok_files.append((src, dest))
                    hashmap[src] = current_hash
            if action_per_file is not None:
                action_per_file(src)

        self.info(f'\tCompiling {len(jobs)} files')
        for (src, dest), line in zip(ok_files, parallel_check_output(jobs, self.info)):
        self.info(f'\tCompiling {len(ok_files)} files')
        items = []
        results = batched_parallel_jobs(
            [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py'), 'STDIN'],
            ok_files)
        for (src, dest), nums in zip(ok_files, results):
            items.append((src, dest, nums))

        for (src, dest, nums) in items:
            self.write_cache(open(dest, 'rb').read(), hashmap[src], src)
            nums = tuple(map(int, re.findall(r'\d+', line)))
            stats_cache(src, nums)
            if handle_stats is not None:
                handle_stats(src, nums)
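
A note on ordering: batched_parallel_jobs() dispatches batches in the order get_tasks() yields them and extends the result list worker by worker, so the returned stats line up one-to-one with ok_files, and the zip() above simply re-attaches each (translated, untranslated) pair to its (src, dest). A toy illustration with made-up numbers:

ok_files = [('de.po', 'de.mo'), ('fr.po', 'fr.mo')]  # hypothetical inputs
results = [[1203, 17], [998, 41]]                    # what the workers might send back
for (src, dest), nums in zip(ok_files, results):
    print(f'{src} -> {dest}: {nums[0]} translated, {nums[1]} untranslated')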


@@ -50,7 +50,6 @@ def usage(code, msg=''):
def add(ctxt, id, str, fuzzy):
    "Add a non-fuzzy translation to the dictionary."
    global MESSAGES
    if not fuzzy and str:
        if id:
            STATS['translated'] += 1
@@ -65,7 +64,6 @@ def add(ctxt, id, str, fuzzy):
def generate():
    "Return the generated output."
    global MESSAGES
    # the keys are sorted in the .mo file
    keys = sorted(MESSAGES.keys())
    offsets = []
@@ -236,9 +234,28 @@ def make(filename, outfile):
        print(msg, file=sys.stderr)


def make_with_stats(filename, outfile):
    MESSAGES.clear()
    STATS['translated'] = STATS['untranslated'] = 0
    make(filename, outfile)
    return STATS['translated'], STATS['untranslated']


def run_batch(pairs):
    for (filename, outfile) in pairs:
        yield make_with_stats(filename, outfile)


def main():
    args = sys.argv[1:]
    if args == ['STDIN']:
        import json
        results = tuple(run_batch(json.loads(sys.stdin.buffer.read())))
        sys.stdout.buffer.write(json.dumps(results).encode('utf-8'))
        sys.stdout.close()
        return
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVso:',
        opts, args = getopt.getopt(args, 'hVso:',
                                   ['help', 'version', 'statistics', 'output-file='])
    except getopt.error as msg:
        usage(1, msg)
@@ -263,8 +280,7 @@ def main():
        return

    for filename in args:
        STATS['translated'] = STATS['untranslated'] = 0
        make(filename, outfile)
        translated, untranslated = make_with_stats(filename, outfile)
        if output_stats:
            print(STATS['translated'], 'translated messages,', STATS['untranslated'], 'untranslated messages.')
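
One detail worth noting in the worker: MESSAGES and STATS are module-level, so make_with_stats() has to reset them before every make() call now that a single process compiles many catalogs in a row (the dropped `global MESSAGES` statements were never needed, since the dict is only mutated in place, never rebound). A self-contained toy showing why the reset matters; the names below are stand-ins, not calibre code:

MESSAGES = {}
STATS = {'translated': 0, 'untranslated': 0}

def compile_one(entries):
    # stand-in for make(): fills MESSAGES and bumps STATS as a side effect
    for msgid, msgstr in entries:
        MESSAGES[msgid] = msgstr
        STATS['translated' if msgstr else 'untranslated'] += 1

def compile_with_stats(entries):
    MESSAGES.clear()                                 # without these resets, catalog N
    STATS['translated'] = STATS['untranslated'] = 0  # would inherit catalog N-1's state
    compile_one(entries)
    return STATS['translated'], STATS['untranslated']

print([compile_with_stats(e) for e in ([('a', 'x')], [('b', ''), ('c', 'y')])])
# -> [(1, 0), (1, 1)]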