Refactor the translations builder to use a cache

commit a545fc497d
parent bda0e2d812
Author: Kovid Goyal
Date:   2016-06-25 14:18:10 +05:30


@@ -6,12 +6,12 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, tempfile, shutil, subprocess, glob, re, time, textwrap, cPickle, shlex, json, errno
+import os, tempfile, shutil, subprocess, glob, re, time, textwrap, cPickle, shlex, json, errno, hashlib
 from collections import defaultdict
 from locale import normalize as normalize_locale
 from functools import partial
 
-from setup import Command, __appname__, __version__, require_git_master
+from setup import Command, __appname__, __version__, require_git_master, build_cache_dir
 from setup.parallel_build import parallel_check_output
 
 def qt_sources():
@@ -207,6 +207,39 @@ class Translations(POT): # {{{
     DEST = os.path.join(os.path.dirname(POT.SRC), 'resources', 'localization',
             'locales')
 
+    @property
+    def cache_dir(self):
+        ans = self.j(build_cache_dir(), 'translations')
+        if not hasattr(self, 'cache_dir_created'):
+            self.cache_dir_created = True
+            try:
+                os.mkdir(ans)
+            except EnvironmentError as err:
+                if err.errno != errno.EEXIST:
+                    raise
+        return ans
+
+    def cache_name(self, f):
+        f = os.path.relpath(f, self.d(self.SRC))
+        return f.replace(os.sep, '.').replace('/', '.').lstrip('.')
+
+    def read_cache(self, f):
+        cname = self.cache_name(f)
+        try:
+            with open(self.j(self.cache_dir, cname), 'rb') as f:
+                data = f.read()
+                return data[:20], data[20:]
+        except EnvironmentError as err:
+            if err.errno != errno.ENOENT:
+                raise
+        return None, None
+
+    def write_cache(self, data, h, f):
+        cname = self.cache_name(f)
+        assert len(h) == 20
+        with open(self.j(self.cache_dir, cname), 'wb') as f:
+            f.write(h), f.write(data)
+
     def po_files(self):
         return glob.glob(os.path.join(self.TRANSLATIONS, __appname__, '*.po'))
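
A note on the cache layout introduced above: each source file maps, via cache_name(), to a single flat file whose first 20 bytes are the raw SHA-1 digest of that source and whose remainder is the cached payload (the compiled .mo bytes, or the content server's JSON). A minimal standalone sketch of that record format, with hypothetical names and no calibre dependencies:

    import errno

    def write_record(path, digest, payload):
        # A record is the raw 20-byte SHA-1 digest followed by the payload.
        assert len(digest) == 20
        with open(path, 'wb') as f:
            f.write(digest)
            f.write(payload)

    def read_record(path):
        # Returns (digest, payload), or (None, None) when no record exists yet.
        try:
            with open(path, 'rb') as f:
                data = f.read()
            return data[:20], data[20:]
        except EnvironmentError as err:
            if err.errno != errno.ENOENT:
                raise
        return None, None

Because cache_name() flattens the repo-relative path into a single filename, every source has exactly one record to probe.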
@@ -215,75 +248,144 @@ class Translations(POT): # {{{
         return locale, os.path.join(self.DEST, locale, 'messages.mo')
 
     def run(self, opts):
-        self.compile_content_server_translations()
         self.compile_main_translations()
-        self.write_stats()
+        self.compile_content_server_translations()
         self.freeze_locales()
         self.compile_user_manual_translations()
 
+    def compile_group(self, files, handle_stats=None, file_ok=None, action_per_file=None):
+        jobs, ok_files = [], []
+        hashmap = {}
+
+        def stats_cache(src, data=None):
+            cname = self.cache_name(src) + '.stats.json'
+            with open(self.j(build_cache_dir(), cname), ('rb' if data is None else 'wb')) as f:
+                if data is None:
+                    return json.load(f)
+                json.dump(data, f)
+
+        for src, dest in files:
+            base = os.path.dirname(dest)
+            if not os.path.exists(base):
+                os.makedirs(base)
+            data, current_hash = self.hash_and_data(src)
+            saved_hash, saved_data = self.read_cache(src)
+            if current_hash == saved_hash:
+                with open(dest, 'wb') as d:
+                    d.write(saved_data)
+                if handle_stats is not None:
+                    handle_stats(src, stats_cache(src))
+            else:
+                if file_ok is None or file_ok(data, src):
+                    self.info('\t' + os.path.relpath(src, self.j(self.d(self.SRC), 'translations')))
+                    jobs.append(['msgfmt', '--statistics', '-o', dest, src])
+                    ok_files.append((src, dest))
+                    hashmap[src] = current_hash
+            if action_per_file is not None:
+                action_per_file(src)
+
+        for (src, dest), line in zip(ok_files, parallel_check_output(jobs, self.info)):
+            self.write_cache(open(dest, 'rb').read(), hashmap[src], src)
+            nums = tuple(map(int, re.findall(r'\d+', line)))
+            stats_cache(src, nums)
+            if handle_stats is not None:
+                handle_stats(src, nums)
+
     def compile_main_translations(self):
         l = {}
         lc_dataf = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lc_data.py')
         exec(compile(open(lc_dataf, 'rb').read(), lc_dataf, 'exec'), l, l)
         lcdata = {k:{k1:v1 for k1, v1 in v} for k, v in l['data']}
         self.iso639_errors = []
-        jobs = []
-        for f in self.po_files():
-            locale, dest = self.mo_file(f)
-            base = os.path.dirname(dest)
-            if not os.path.exists(base):
-                os.makedirs(base)
-            jobs.append(['msgfmt', '-o', dest, f])
-            iscpo = {'bn':'bn_IN', 'zh_HK':'zh_CN'}.get(locale, locale)
-            iso639 = self.j(self.TRANSLATIONS, 'iso_639', '%s.po'%iscpo)
-            if os.path.exists(iso639) and self.check_iso639(iso639):
-                dest = self.j(self.d(dest), 'iso639.mo')
-                if self.newer(dest, iso639):
-                    jobs.append(['msgfmt', '-o', dest, iso639])
-            elif locale not in {
-                'en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds',
-                'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku', 'fr_CA', 'him',
-                'jv', 'ka', 'fur', 'ber', 'my', 'fil', 'hy', 'ug'}:
-                self.warn('No ISO 639 translations for locale:', locale)
+        self.info('Compiling main UI translation files...')
+        fmap = {f:self.mo_file(f) for f in self.po_files()}
+        files = [(f, fmap[f][1]) for f in self.po_files()]
+
+        def action_per_file(f):
+            locale, dest = fmap[f]
             ln = normalize_locale(locale).partition('.')[0]
             if ln in lcdata:
                 ld = lcdata[ln]
                 lcdest = self.j(self.d(dest), 'lcdata.pickle')
                 with open(lcdest, 'wb') as lcf:
                     lcf.write(cPickle.dumps(ld, -1))
-        self.info('\nCompiling %d translation files...' % len(jobs))
-        tuple(parallel_check_output(jobs, self.info))
+
+        stats = {}
+
+        def handle_stats(f, nums):
+            trans = nums[0]
+            total = trans if len(nums) == 1 else (trans + nums[1])
+            locale = fmap[f][0]
+            stats[locale] = min(1.0, float(trans)/total)
+
+        self.compile_group(files, handle_stats=handle_stats, action_per_file=action_per_file)
+
+        self.info('Compiling ISO639 files...')
+        files = []
+        skip_iso = {
+            'en_GB', 'en_CA', 'en_AU', 'si', 'ur', 'sc', 'ltg', 'nds',
+            'te', 'yi', 'fo', 'sq', 'ast', 'ml', 'ku', 'fr_CA', 'him',
+            'jv', 'ka', 'fur', 'ber', 'my', 'fil', 'hy', 'ug'}
+        for f, (locale, dest) in fmap.iteritems():
+            iscpo = {'bn':'bn_IN', 'zh_HK':'zh_CN'}.get(locale, locale)
+            iso639 = self.j(self.TRANSLATIONS, 'iso_639', '%s.po'%iscpo)
+            if os.path.exists(iso639):
+                files.append((iso639, self.j(self.d(dest), 'iso639.mo')))
+            elif locale not in skip_iso:
+                self.warn('No ISO 639 translations for locale:', locale)
+        self.compile_group(files, file_ok=self.check_iso639)
 
         if self.iso639_errors:
             for err in self.iso639_errors:
                 print (err)
             raise SystemExit(1)
 
+        dest = self.stats
+        base = self.d(dest)
+        try:
+            os.mkdir(base)
+        except EnvironmentError as err:
+            if err.errno != errno.EEXIST:
+                raise
+        cPickle.dump(stats, open(dest, 'wb'), -1)
+
+    def hash_and_data(self, f):
+        with open(f, 'rb') as s:
+            data = s.read()
+            h = hashlib.sha1(data)
+            h.update(f.encode('utf-8'))
+        return data, h.digest()
+
     def compile_content_server_translations(self):
-        self.info('\nCompiling content-server translations')
+        self.info('Compiling content-server translations')
         from calibre.utils.rapydscript import msgfmt
         from calibre.utils.zipfile import ZipFile, ZIP_DEFLATED, ZipInfo
         with ZipFile(self.j(self.RESOURCES, 'content-server', 'locales.zip'), 'w', ZIP_DEFLATED) as zf:
             for src in glob.glob(os.path.join(self.TRANSLATIONS, 'content-server', '*.po')):
-                with open(src, 'rb') as f:
-                    po_data = f.read().decode('utf-8')
-                data = json.loads(msgfmt(po_data))
-                translated_entries = {k:v for k, v in data['entries'].iteritems() if v and sum(map(len, v))}
-                data['entries'] = translated_entries
-                if translated_entries:
-                    raw = json.dumps(data, ensure_ascii=False, sort_keys=True)
-                    if isinstance(raw, type(u'')):
-                        raw = raw.encode('utf-8')
+                data, current_hash = self.hash_and_data(src)
+                saved_hash, saved_data = self.read_cache(src)
+                if current_hash == saved_hash:
+                    raw = saved_data
+                else:
+                    self.info('\tParsing ' + os.path.basename(src))
+                    raw = None
+                    po_data = data.decode('utf-8')
+                    data = json.loads(msgfmt(po_data))
+                    translated_entries = {k:v for k, v in data['entries'].iteritems() if v and sum(map(len, v))}
+                    data['entries'] = translated_entries
+                    cdata = b'{}'
+                    if translated_entries:
+                        raw = json.dumps(data, ensure_ascii=False, sort_keys=True)
+                        if isinstance(raw, type(u'')):
+                            raw = raw.encode('utf-8')
+                        cdata = raw
+                    self.write_cache(cdata, current_hash, src)
+                if raw:
                     zi = ZipInfo(os.path.basename(src).rpartition('.')[0])
                     zi.compress_type = ZIP_DEFLATED
                     zf.writestr(zi, raw)
 
-    def check_iso639(self, path):
+    def check_iso639(self, raw, path):
         from calibre.utils.localization import langnames_to_langcodes
-        with open(path, 'rb') as f:
-            raw = f.read()
         rmap = {}
         msgid = None
         has_errors = False
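
The cache probe in compile_group and compile_content_server_translations is the same two-step check: hash the source (contents plus path, so identical files at different locations get distinct keys), compare against the stored digest, and invoke msgfmt only on a miss. A rough standalone sketch of that decision, reusing the hypothetical read_record/write_record helpers from the earlier sketch:

    import hashlib, subprocess

    def hash_and_data(path):
        # SHA-1 over the file's bytes plus its path, as the method above does.
        with open(path, 'rb') as f:
            data = f.read()
        h = hashlib.sha1(data)
        h.update(path.encode('utf-8'))
        return data, h.digest()

    def compile_po(src, dest, read_record, write_record):
        data, digest = hash_and_data(src)
        old_digest, old_payload = read_record(src)
        if digest == old_digest:
            # Cache hit: reuse the previously compiled .mo bytes verbatim.
            with open(dest, 'wb') as f:
                f.write(old_payload)
            return
        # Cache miss: compile, then store the fresh output under the new digest.
        subprocess.check_call(['msgfmt', '--statistics', '-o', dest, src])
        with open(dest, 'rb') as f:
            write_record(src, digest, f.read())

The real method batches the misses and runs them through parallel_check_output rather than one subprocess per file.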
@@ -318,36 +420,13 @@ class Translations(POT): # {{{
     def stats(self):
         return self.j(self.d(self.DEST), 'stats.pickle')
 
-    def write_stats(self):
-        files = self.po_files()
-        dest = self.stats
-        if not self.newer(dest, files):
-            return
-        self.info('Calculating translation statistics...')
-        stats = {}
-        jobs = (
-            ['msgfmt', '--statistics', '-o', os.devnull, x] for x in files
-        )
-        for f, line in zip(files, parallel_check_output(jobs, self.info)):
-            nums = tuple(map(int, re.findall(r'\d+', line)))
-            trans = nums[0]
-            total = trans if len(nums) == 1 else (trans + nums[1])
-            locale = self.mo_file(f)[0]
-            stats[locale] = min(1.0, float(trans)/total)
-
-        base = self.d(dest)
-        try:
-            os.mkdir(base)
-        except EnvironmentError as err:
-            if err.errno != errno.EEXIST:
-                raise
-        cPickle.dump(stats, open(dest, 'wb'), -1)
-
     def compile_user_manual_translations(self):
         self.info('Compiling user manual translations...')
         srcbase = self.j(self.d(self.SRC), 'translations', 'manual')
         destbase = self.j(self.d(self.SRC), 'manual', 'locale')
         complete = {}
+        all_stats = defaultdict(lambda : {'translated': 0, 'untranslated': 0})
+        files = []
         for x in os.listdir(srcbase):
             q = self.j(srcbase, x)
             if not os.path.isdir(q):
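
Dropping write_stats() works because compile_group already runs msgfmt with --statistics, hands the parsed numbers to the handle_stats callbacks, and caches them in a .stats.json record so cache hits can replay them. The parsing itself is just "pull the integers off the summary line"; a small sketch (the sample line is illustrative):

    import re

    def parse_msgfmt_stats(line):
        # e.g. "1021 translated messages, 54 untranslated messages."
        nums = tuple(map(int, re.findall(r'\d+', line)))
        translated = nums[0]
        total = translated if len(nums) == 1 else translated + nums[1]
        return translated, total

    translated, total = parse_msgfmt_stats(
        '1021 translated messages, 54 untranslated messages.')
    completeness = min(1.0, float(translated) / total)  # ~0.95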
@@ -356,27 +435,27 @@ class Translations(POT): # {{{
             if os.path.exists(dest):
                 shutil.rmtree(dest)
             os.makedirs(dest)
-            jobs = []
             for po in os.listdir(q):
                 if not po.endswith('.po'):
                     continue
-                jobs.append([
-                    'msgfmt', '--statistics', '-o', self.j(
-                        dest, po.rpartition('.')[0] + '.mo'), self.j(q, po)])
-            stats = tuple(parallel_check_output(jobs, self.info))
-            translated = untranslated = 0
-            for line in stats:
-                nums = tuple(map(int, re.findall(r'\d+', line)))
-                translated += nums[0]
-                if len(nums) > 1:
-                    untranslated += nums[1]
-            stats = {'translated':translated, 'untranslated':untranslated}
-            with open(self.j(self.d(dest), 'stats.json'), 'wb') as f:
-                json.dump(stats, f)
-            total = translated + untranslated
-            # Raise the 30% threshold in the future
-            if total and (translated / float(total)) > 0.3:
-                complete[x] = stats
+                mofile = self.j(dest, po.rpartition('.')[0] + '.mo')
+                files.append((self.j(q, po), mofile))
+
+        def handle_stats(src, nums):
+            locale = self.b(self.d(src))
+            stats = all_stats[locale]
+            stats['translated'] += nums[0]
+            if len(nums) > 1:
+                stats['untranslated'] += nums[1]
+
+        self.compile_group(files, handle_stats=handle_stats)
+
+        for locale, stats in all_stats.iteritems():
+            with open(self.j(srcbase, locale, 'stats.json'), 'wb') as f:
+                json.dump(stats, f)
+            total = stats['translated'] + stats['untranslated']
+            # Raise the 30% threshold in the future
+            if total and (stats['translated'] / float(total)) > 0.3:
+                complete[locale] = stats
 
         with open(self.j(destbase, 'completed.json'), 'wb') as f:
             json.dump(complete, f, indent=True, sort_keys=True)
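
With per-file compilation delegated to compile_group, the manual's per-locale totals are now accumulated in a defaultdict by the handle_stats callback, and the 30% completeness gate is applied per locale afterwards. A condensed sketch of that aggregation (sample numbers made up):

    from collections import defaultdict

    all_stats = defaultdict(lambda: {'translated': 0, 'untranslated': 0})

    def handle_stats(locale, nums):
        stats = all_stats[locale]
        stats['translated'] += nums[0]
        if len(nums) > 1:
            stats['untranslated'] += nums[1]

    # One (translated, untranslated) pair per compiled .po file:
    for locale, nums in [('fr', (120, 30)), ('fr', (80, 10)), ('de', (5, 95))]:
        handle_stats(locale, nums)

    complete = {}
    for locale, stats in all_stats.items():
        total = stats['translated'] + stats['untranslated']
        if total and stats['translated'] / float(total) > 0.3:  # 30% threshold
            complete[locale] = stats
    # complete == {'fr': {'translated': 200, 'untranslated': 40}}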
@@ -389,6 +468,7 @@ class Translations(POT): # {{{
         destbase = self.j(self.d(self.SRC), 'manual', 'locale')
         if os.path.exists(destbase):
             shutil.rmtree(destbase)
+        shutil.rmtree(self.cache_dir)
 
 # }}}