Refactor the translations builder to use a cache

Kovid Goyal 2016-06-25 14:18:10 +05:30
parent bda0e2d812
commit a545fc497d


@@ -6,12 +6,12 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
-import os, tempfile, shutil, subprocess, glob, re, time, textwrap, cPickle, shlex, json, errno
+import os, tempfile, shutil, subprocess, glob, re, time, textwrap, cPickle, shlex, json, errno, hashlib
from collections import defaultdict
from locale import normalize as normalize_locale
from functools import partial
-from setup import Command, __appname__, __version__, require_git_master
+from setup import Command, __appname__, __version__, require_git_master, build_cache_dir
from setup.parallel_build import parallel_check_output
def qt_sources():
@@ -207,6 +207,39 @@ class Translations(POT): # {{{
    DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization',
            'locales')
+    @property
+    def cache_dir(self):
+        ans = self.j(build_cache_dir(), 'translations')
+        if not hasattr(self, 'cache_dir_created'):
+            self.cache_dir_created = True
+            try:
+                os.mkdir(ans)
+            except EnvironmentError as err:
+                if err.errno != errno.EEXIST:
+                    raise
+        return ans
+
+    def cache_name(self, f):
+        f = os.path.relpath(f, self.d(self.SRC))
+        return f.replace(os.sep, '.').replace('/', '.').lstrip('.')
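
The cache key is the source path relative to the repository root, flattened into a single file name. A minimal sketch of the same transform, using a hypothetical checkout rooted at /repo:

    # Illustrative only; mirrors cache_name() above.
    rel = os.path.relpath('/repo/translations/calibre/de.po', '/repo')  # 'translations/calibre/de.po'
    rel.replace(os.sep, '.').replace('/', '.').lstrip('.')              # 'translations.calibre.de.po'
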
+    def read_cache(self, f):
+        cname = self.cache_name(f)
+        try:
+            with open(self.j(self.cache_dir, cname), 'rb') as f:
+                data = f.read()
+                return data[:20], data[20:]
+        except EnvironmentError as err:
+            if err.errno != errno.ENOENT:
+                raise
+        return None, None
+
+    def write_cache(self, data, h, f):
+        cname = self.cache_name(f)
+        assert len(h) == 20
+        with open(self.j(self.cache_dir, cname), 'wb') as f:
+            f.write(h), f.write(data)
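
Each cache entry is the 20-byte SHA-1 digest followed immediately by the payload, which is why read_cache() splits the record at byte 20. A sketch of the layout:

    # [20-byte digest][compiled payload ...]
    record = h + data                                  # what write_cache() stores
    saved_hash, saved_data = record[:20], record[20:]  # what read_cache() returns
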
    def po_files(self):
        return glob.glob(os.path.join(self.TRANSLATIONS, __appname__, '*.po'))
@@ -215,75 +248,144 @@ class Translations(POT): # {{{
        return locale, os.path.join(self.DEST, locale, 'messages.mo')

    def run(self, opts):
+        self.compile_content_server_translations()
        self.compile_main_translations()
-        self.write_stats()
-        self.compile_content_server_translations()
        self.freeze_locales()
        self.compile_user_manual_translations()
+    def compile_group(self, files, handle_stats=None, file_ok=None, action_per_file=None):
+        jobs, ok_files = [], []
+        hashmap = {}
+
+        def stats_cache(src, data=None):
+            cname = self.cache_name(src) + '.stats.json'
+            with open(self.j(build_cache_dir(), cname), ('rb' if data is None else 'wb')) as f:
+                if data is None:
+                    return json.load(f)
+                json.dump(data, f)
+
+        for src, dest in files:
+            base = os.path.dirname(dest)
+            if not os.path.exists(base):
+                os.makedirs(base)
+            data, current_hash = self.hash_and_data(src)
+            saved_hash, saved_data = self.read_cache(src)
+            if current_hash == saved_hash:
+                with open(dest, 'wb') as d:
+                    d.write(saved_data)
+                if handle_stats is not None:
+                    handle_stats(src, stats_cache(src))
+            else:
+                if file_ok is None or file_ok(data, src):
+                    self.info('\t' + os.path.relpath(src, self.j(self.d(self.SRC), 'translations')))
+                    jobs.append(['msgfmt', '--statistics', '-o', dest, src])
+                    ok_files.append((src, dest))
+                    hashmap[src] = current_hash
+            if action_per_file is not None:
+                action_per_file(src)
+
+        for (src, dest), line in zip(ok_files, parallel_check_output(jobs, self.info)):
+            self.write_cache(open(dest, 'rb').read(), hashmap[src], src)
+            nums = tuple(map(int, re.findall(r'\d+', line)))
+            stats_cache(src, nums)
+            if handle_stats is not None:
+                handle_stats(src, nums)
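
On a miss, compile_group() queues an msgfmt job and later pulls the counts out of the statistics line msgfmt prints. The same parsing on an illustrative line (real msgfmt output may also include a fuzzy count):

    line = '1402 translated messages, 5 untranslated messages.'
    nums = tuple(map(int, re.findall(r'\d+', line)))           # (1402, 5)
    coverage = min(1.0, float(nums[0]) / (nums[0] + nums[1]))  # ~0.996
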
    def compile_main_translations(self):
        l = {}
        lc_dataf = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lc_data.py')
        exec(compile(open(lc_dataf, 'rb').read(), lc_dataf, 'exec'), l, l)
        lcdata = {k:{k1:v1 for k1, v1 in v} for k, v in l['data']}
        self.iso639_errors = []
-        jobs = []
-        for f in self.po_files():
-            locale, dest = self.mo_file(f)
-            base = os.path.dirname(dest)
-            if not os.path.exists(base):
-                os.makedirs(base)
-            jobs.append(['msgfmt', '-o', dest, f])
-            iscpo = {'bn':'bn_IN', 'zh_HK':'zh_CN'}.get(locale, locale)
-            iso639 = self.j(self.TRANSLATIONS, 'iso_639', '%s.po'%iscpo)
-            if os.path.exists(iso639) and self.check_iso639(iso639):
-                dest = self.j(self.d(dest), 'iso639.mo')
-                if self.newer(dest, iso639):
-                    jobs.append(['msgfmt', '-o', dest, iso639])
-            elif locale not in {
-                'en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds',
-                'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku', 'fr_CA', 'him',
-                'jv', 'ka', 'fur', 'ber', 'my', 'fil', 'hy', 'ug'}:
-                self.warn('No ISO 639 translations for locale:', locale)
+        self.info('Compiling main UI translation files...')
+        fmap = {f:self.mo_file(f) for f in self.po_files()}
+        files = [(f, fmap[f][1]) for f in self.po_files()]
+
+        def action_per_file(f):
+            locale, dest = fmap[f]
            ln = normalize_locale(locale).partition('.')[0]
            if ln in lcdata:
                ld = lcdata[ln]
                lcdest = self.j(self.d(dest), 'lcdata.pickle')
                with open(lcdest, 'wb') as lcf:
                    lcf.write(cPickle.dumps(ld, -1))
-        self.info('\nCompiling %d translation files...' % len(jobs))
-        tuple(parallel_check_output(jobs, self.info))
+        stats = {}
+
+        def handle_stats(f, nums):
+            trans = nums[0]
+            total = trans if len(nums) == 1 else (trans + nums[1])
+            locale = fmap[f][0]
+            stats[locale] = min(1.0, float(trans)/total)
+
+        self.compile_group(files, handle_stats=handle_stats, action_per_file=action_per_file)
+
+        self.info('Compiling ISO639 files...')
+        files = []
+        skip_iso = {
+            'en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds',
+            'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku', 'fr_CA', 'him',
+            'jv', 'ka', 'fur', 'ber', 'my', 'fil', 'hy', 'ug'}
+        for f, (locale, dest) in fmap.iteritems():
+            iscpo = {'bn':'bn_IN', 'zh_HK':'zh_CN'}.get(locale, locale)
+            iso639 = self.j(self.TRANSLATIONS, 'iso_639', '%s.po'%iscpo)
+            if os.path.exists(iso639):
+                files.append((iso639, self.j(self.d(dest), 'iso639.mo')))
+            elif locale not in skip_iso:
+                self.warn('No ISO 639 translations for locale:', locale)
+        self.compile_group(files, file_ok=self.check_iso639)
+
        if self.iso639_errors:
            for err in self.iso639_errors:
                print (err)
            raise SystemExit(1)
+
+        dest = self.stats
+        base = self.d(dest)
+        try:
+            os.mkdir(base)
+        except EnvironmentError as err:
+            if err.errno != errno.EEXIST:
+                raise
+        cPickle.dump(stats, open(dest, 'wb'), -1)
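
The stats pickle maps each locale to its translated fraction; reading it back is a one-liner (with dest as above, values illustrative):

    stats = cPickle.load(open(dest, 'rb'))  # e.g. {'de': 0.98, 'fr': 1.0}
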
+    def hash_and_data(self, f):
+        with open(f, 'rb') as s:
+            data = s.read()
+        h = hashlib.sha1(data)
+        h.update(f.encode('utf-8'))
+        return data, h.digest()
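
Note that the digest covers both the file contents and the file path, so a renamed .po file misses the cache even if its bytes are unchanged. SHA-1 digests are 20 bytes, which is what the assert in write_cache() and the data[:20] split in read_cache() rely on:

    import hashlib
    assert hashlib.sha1().digest_size == 20
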
    def compile_content_server_translations(self):
-        self.info('\nCompiling content-server translations')
+        self.info('Compiling content-server translations')
        from calibre.utils.rapydscript import msgfmt
        from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED, ZipInfo
        with ZipFile(self.j(self.RESOURCES, 'content-server', 'locales.zip'), 'w', ZIP_DEFLATED) as zf:
            for src in glob.glob(os.path.join(self.TRANSLATIONS, 'content-server', '*.po')):
-                with open(src, 'rb') as f:
-                    po_data = f.read().decode('utf-8')
+                data, current_hash = self.hash_and_data(src)
+                saved_hash, saved_data = self.read_cache(src)
+                if current_hash == saved_hash:
+                    raw = saved_data
+                else:
+                    self.info('\tParsing ' + os.path.basename(src))
+                    raw = None
+                    po_data = data.decode('utf-8')
+                    data = json.loads(msgfmt(po_data))
+                    translated_entries = {k:v for k, v in data['entries'].iteritems() if v and sum(map(len, v))}
+                    data['entries'] = translated_entries
+                    cdata = b'{}'
+                    if translated_entries:
+                        raw = json.dumps(data, ensure_ascii=False, sort_keys=True)
+                        if isinstance(raw, type(u'')):
+                            raw = raw.encode('utf-8')
+                        cdata = raw
+                    self.write_cache(cdata, current_hash, src)
+                if raw:
+                    zi = ZipInfo(os.path.basename(src).rpartition('.')[0])
+                    zi.compress_type = ZIP_DEFLATED
+                    zf.writestr(zi, raw)
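
One subtlety: a .po file with no translated entries is cached as the placeholder b'{}'. On the first run raw stays None and nothing is written to the zip, but on a later cache hit raw becomes the truthy b'{}' and an empty entry lands in locales.zip. If that asymmetry matters, a stricter guard would be something like:

    if raw and raw != b'{}':
        ...
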
-    def check_iso639(self, path):
+    def check_iso639(self, raw, path):
        from calibre.utils.localization import langnames_to_langcodes
-        with open(path, 'rb') as f:
-            raw = f.read()
        rmap = {}
        msgid = None
        has_errors = False
@@ -318,36 +420,13 @@ class Translations(POT): # {{{
    def stats(self):
        return self.j(self.d(self.DEST), 'stats.pickle')

-    def write_stats(self):
-        files = self.po_files()
-        dest = self.stats
-        if not self.newer(dest, files):
-            return
-        self.info('Calculating translation statistics...')
-        stats = {}
-        jobs = (
-            ['msgfmt', '--statistics', '-o', os.devnull, x] for x in files
-        )
-        for f, line in zip(files, parallel_check_output(jobs, self.info)):
-            nums = tuple(map(int, re.findall(r'\d+', line)))
-            trans = nums[0]
-            total = trans if len(nums) == 1 else (trans + nums[1])
-            locale = self.mo_file(f)[0]
-            stats[locale] = min(1.0, float(trans)/total)
-        base = self.d(dest)
-        try:
-            os.mkdir(base)
-        except EnvironmentError as err:
-            if err.errno != errno.EEXIST:
-                raise
-        cPickle.dump(stats, open(dest, 'wb'), -1)
    def compile_user_manual_translations(self):
        self.info('Compiling user manual translations...')
        srcbase = self.j(self.d(self.SRC), 'translations', 'manual')
        destbase = self.j(self.d(self.SRC), 'manual', 'locale')
        complete = {}
+        all_stats = defaultdict(lambda : {'translated': 0, 'untranslated': 0})
+        files = []
        for x in os.listdir(srcbase):
            q = self.j(srcbase, x)
            if not os.path.isdir(q):
@@ -356,27 +435,27 @@ class Translations(POT): # {{{
            if os.path.exists(dest):
                shutil.rmtree(dest)
            os.makedirs(dest)
-            jobs = []
            for po in os.listdir(q):
                if not po.endswith('.po'):
                    continue
-                jobs.append([
-                    'msgfmt', '--statistics', '-o', self.j(
-                        dest, po.rpartition('.')[0] + '.mo'), self.j(q, po)])
-            stats = tuple(parallel_check_output(jobs, self.info))
-            translated = untranslated = 0
-            for line in stats:
-                nums = tuple(map(int, re.findall(r'\d+', line)))
-                translated += nums[0]
+                mofile = self.j(dest, po.rpartition('.')[0] + '.mo')
+                files.append((self.j(q, po), mofile))
+
+        def handle_stats(src, nums):
+            locale = self.b(self.d(src))
+            stats = all_stats[locale]
+            stats['translated'] += nums[0]
            if len(nums) > 1:
-                untranslated += nums[1]
-            stats = {'translated':translated, 'untranslated':untranslated}
-            with open(self.j(self.d(dest), 'stats.json'), 'wb') as f:
+                stats['untranslated'] += nums[1]
+
+        self.compile_group(files, handle_stats=handle_stats)
+
+        for locale, stats in all_stats.iteritems():
+            with open(self.j(srcbase, locale, 'stats.json'), 'wb') as f:
                json.dump(stats, f)
-            total = translated + untranslated
+            total = stats['translated'] + stats['untranslated']
            # Raise the 30% threshold in the future
-            if total and (translated / float(total)) > 0.3:
-                complete[x] = stats
+            if total and (stats['translated'] / float(total)) > 0.3:
+                complete[locale] = stats
        with open(self.j(destbase, 'completed.json'), 'wb') as f:
            json.dump(complete, f, indent=True, sort_keys=True)
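
completed.json thus maps every locale whose coverage exceeds the 30% threshold to its message counts, roughly of this shape (values illustrative):

    # {'de': {'translated': 4012, 'untranslated': 310}, ...}
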
@@ -389,6 +468,7 @@ class Translations(POT): # {{{
        destbase = self.j(self.d(self.SRC), 'manual', 'locale')
        if os.path.exists(destbase):
            shutil.rmtree(destbase)
+        shutil.rmtree(self.cache_dir)
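
(Worth noting: self.cache_dir is a property that creates the translations cache directory on first access, so this rmtree() also works in a tree where the cache has never been populated.)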
# }}}