Speed up compiling translations

Avoids calling a separate msgfmt binary for every .po file, since there
are almost 4000 of them. Instead uses msgfmt.py, batching the files across
a small pool of worker processes.
Kovid Goyal 2019-12-06 09:38:15 +05:30
parent 2865326de3
commit 6811bb0cf7
3 changed files with 77 additions and 50 deletions
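
A rough sketch of the arithmetic (figures assumed from the commit message and the new 16-worker cap, not measured):

    # ~4000 .po files split across at most 16 workers (the new cpu_count cap),
    # mirroring the divmod-based chunking in batched_parallel_jobs() below.
    chunksize, extra = divmod(4000, 16)   # (250, 0)
    # -> at most 16 msgfmt.py processes, each compiling ~250 files over a single
    #    stdin/stdout round trip, instead of ~4000 separate msgfmt invocations.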

setup/parallel_build.py

@@ -5,45 +5,15 @@
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

-import subprocess, os
+import subprocess, os, itertools, json, sys
 from multiprocessing.dummy import Pool
+from threading import Thread
 from functools import partial
 from contextlib import closing

-from setup import iswindows
-from polyglot.builtins import unicode_type
+from polyglot.builtins import unicode_type, as_bytes

-if iswindows:
-    from ctypes import windll, Structure, POINTER, c_size_t
-    from ctypes.wintypes import WORD, DWORD, LPVOID
-
-    class SYSTEM_INFO(Structure):
-        _fields_ = [
-            ("wProcessorArchitecture", WORD),
-            ("wReserved", WORD),
-            ("dwPageSize", DWORD),
-            ("lpMinimumApplicationAddress", LPVOID),
-            ("lpMaximumApplicationAddress", LPVOID),
-            ("dwActiveProcessorMask", c_size_t),
-            ("dwNumberOfProcessors", DWORD),
-            ("dwProcessorType", DWORD),
-            ("dwAllocationGranularity", DWORD),
-            ("wProcessorLevel", WORD),
-            ("wProcessorRevision", WORD)]
-    gsi = windll.kernel32.GetSystemInfo
-    gsi.argtypes = [POINTER(SYSTEM_INFO)]
-    gsi.restype = None
-    si = SYSTEM_INFO()
-    gsi(si)
-    cpu_count = si.dwNumberOfProcessors
-else:
-    from multiprocessing import cpu_count
-    try:
-        cpu_count = cpu_count()
-    except NotImplementedError:
-        cpu_count = 1
-
-cpu_count = min(16, max(1, cpu_count))
+cpu_count = min(16, max(1, os.cpu_count()))


 def run_worker(job, decorate=True):
@@ -95,3 +65,44 @@ def parallel_check_output(jobs, log):
                 log(stderr)
                 raise SystemExit(1)
             yield stdout
+
+
+def get_tasks(it, size):
+    it = iter(it)
+    while 1:
+        x = tuple(itertools.islice(it, size))
+        if not x:
+            return
+        yield x
+
+
+def batched_parallel_jobs(cmd, jobs, cwd=None):
+    chunksize, extra = divmod(len(jobs), cpu_count)
+    if extra:
+        chunksize += 1
+    workers = []
+
+    def get_output(p):
+        p.output = p.communicate(as_bytes(json.dumps(p.jobs_batch)))
+
+    for batch in get_tasks(jobs, chunksize):
+        p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
+        p.jobs_batch = batch
+        p.output_thread = t = Thread(target=get_output, args=(p,))
+        t.daemon = True
+        t.start()
+        workers.append(p)
+
+    failed = False
+    ans = []
+    for p in workers:
+        p.output_thread.join()
+        if p.wait() != 0:
+            sys.stderr.buffer.write(p.output[1])
+            sys.stderr.buffer.flush()
+            failed = True
+        else:
+            ans.extend(json.loads(p.output[0]))
+    if failed:
+        raise SystemExit('Worker process failed')
+    return ans
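
A minimal usage sketch of the new helper (the job paths are placeholders, not part of the commit); the command must read a JSON-encoded batch from stdin and print a JSON list with one result per job:

    import sys
    from setup.parallel_build import batched_parallel_jobs

    # Hypothetical job list: (source, destination) pairs, as compile_group() builds them.
    jobs = [('de.po', 'de.mo'), ('fr.po', 'fr.mo'), ('es.po', 'es.mo')]
    # Jobs are split into per-worker batches in order, and each worker's JSON
    # output is concatenated in the same order, so results[i] corresponds to jobs[i].
    results = batched_parallel_jobs(
        [sys.executable, 'src/calibre/translations/msgfmt.py', 'STDIN'], jobs)
    assert len(results) == len(jobs)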

setup/translations.py

@@ -12,7 +12,7 @@ from locale import normalize as normalize_locale
 from functools import partial

 from setup import Command, __appname__, __version__, require_git_master, build_cache_dir, edit_file, dump_json
-from setup.parallel_build import parallel_check_output
+from setup.parallel_build import batched_parallel_jobs
 from polyglot.builtins import codepoint_to_chr, iteritems, range

 is_ci = os.environ.get('CI', '').lower() == 'true'
@@ -320,8 +320,7 @@ class Translations(POT):  # {{{
         self.compile_changelog_translations()

     def compile_group(self, files, handle_stats=None, file_ok=None, action_per_file=None):
-        from calibre.constants import islinux
-        jobs, ok_files = [], []
+        ok_files = []
        hashmap = {}

         def stats_cache(src, data=None):
@@ -349,20 +348,21 @@
             else:
                 if file_ok is None or file_ok(data, src):
                     # self.info('\t' + os.path.relpath(src, self.j(self.d(self.SRC), 'translations')))
-                    if islinux:
-                        msgfmt = ['msgfmt']
-                    else:
-                        msgfmt = [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py')]
-                    jobs.append(msgfmt + ['--statistics', '-o', dest, src])
                     ok_files.append((src, dest))
                     hashmap[src] = current_hash
             if action_per_file is not None:
                 action_per_file(src)

-        self.info(f'\tCompiling {len(jobs)} files')
-        for (src, dest), line in zip(ok_files, parallel_check_output(jobs, self.info)):
+        self.info(f'\tCompiling {len(ok_files)} files')
+        items = []
+        results = batched_parallel_jobs(
+            [sys.executable, self.j(self.SRC, 'calibre', 'translations', 'msgfmt.py'), 'STDIN'],
+            ok_files)
+        for (src, dest), nums in zip(ok_files, results):
+            items.append((src, dest, nums))
+
+        for (src, dest, nums) in items:
             self.write_cache(open(dest, 'rb').read(), hashmap[src], src)
-            nums = tuple(map(int, re.findall(r'\d+', line)))
             stats_cache(src, nums)
             if handle_stats is not None:
                 handle_stats(src, nums)
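
For reference, the results returned by batched_parallel_jobs() here are expected to line up positionally with ok_files, one [translated, untranslated] pair per file (the JSON round trip turns the tuples from make_with_stats() into lists). A hypothetical value, with placeholder file names and invented counts:

    # Hypothetical shape of `results` for three compiled .po files;
    # results[i] is zipped with ok_files[i] above.
    results = [
        [11823, 402],    # de.po: translated, untranslated message counts
        [11934, 291],    # fr.po
        [10512, 1713],   # es.po
    ]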

src/calibre/translations/msgfmt.py

@@ -50,7 +50,6 @@ def usage(code, msg=''):

 def add(ctxt, id, str, fuzzy):
     "Add a non-fuzzy translation to the dictionary."
-    global MESSAGES
     if not fuzzy and str:
         if id:
             STATS['translated'] += 1
@@ -65,7 +64,6 @@ def add(ctxt, id, str, fuzzy):

 def generate():
     "Return the generated output."
-    global MESSAGES
     # the keys are sorted in the .mo file
     keys = sorted(MESSAGES.keys())
     offsets = []
@@ -236,9 +234,28 @@ def make(filename, outfile):
         print(msg, file=sys.stderr)


+def make_with_stats(filename, outfile):
+    MESSAGES.clear()
+    STATS['translated'] = STATS['untranslated'] = 0
+    make(filename, outfile)
+    return STATS['translated'], STATS['untranslated']
+
+
+def run_batch(pairs):
+    for (filename, outfile) in pairs:
+        yield make_with_stats(filename, outfile)
+
+
 def main():
+    args = sys.argv[1:]
+    if args == ['STDIN']:
+        import json
+        results = tuple(run_batch(json.loads(sys.stdin.buffer.read())))
+        sys.stdout.buffer.write(json.dumps(results).encode('utf-8'))
+        sys.stdout.close()
+        return
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hVso:',
+        opts, args = getopt.getopt(args, 'hVso:',
                                    ['help', 'version', 'statistics', 'output-file='])
     except getopt.error as msg:
         usage(1, msg)
@@ -263,8 +280,7 @@ def main():
         return

     for filename in args:
-        STATS['translated'] = STATS['untranslated'] = 0
-        make(filename, outfile)
+        translated, untranslated = make_with_stats(filename, outfile)
         if output_stats:
             print(STATS['translated'], 'translated messages,', STATS['untranslated'], 'untranslated messages.')
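
A minimal sketch of driving the new STDIN mode by hand (file paths are placeholders): msgfmt.py expects a JSON array of [po_file, mo_file] pairs on stdin and writes a JSON array of [translated, untranslated] counts to stdout.

    import json, subprocess, sys

    # Placeholder paths; in the real build compile_group() supplies these.
    pairs = [['de.po', 'de.mo'], ['fr.po', 'fr.mo']]
    p = subprocess.run(
        [sys.executable, 'src/calibre/translations/msgfmt.py', 'STDIN'],
        input=json.dumps(pairs).encode('utf-8'),
        stdout=subprocess.PIPE, check=True)
    stats = json.loads(p.stdout)   # e.g. [[11823, 402], [11934, 291]]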