Code to fetch hyphenation dictionaries

This commit is contained in:
Kovid Goyal 2019-11-30 10:59:45 +05:30
parent f57d45de8d
commit 1c54eb1648
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 119 additions and 3 deletions

1
.gitignore vendored
View File

@ -15,6 +15,7 @@ build
dist
docs
resources/localization
resources/hyphenation
resources/scripts.calibre_msgpack
resources/ebook-convert-complete.calibre_msgpack
resources/builtin_recipes.xml

View File

@ -6,7 +6,8 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, re, os, platform, subprocess, time, errno
import sys, re, os, platform, subprocess, time, errno, tempfile, shutil
from contextlib import contextmanager
# True when platform.architecture() reports '64bit' (by default it inspects the running interpreter binary)
is64bit = platform.architecture()[0] == '64bit'
# A regex match object (truthy) when sys.platform contains 'win32' or 'win64', otherwise None
iswindows = re.search('win(32|64)', sys.platform)
@ -289,6 +290,14 @@ class Command(object):
warnings.append((args, kwargs))
sys.stdout.flush()
@contextmanager
def temp_dir(self, **kw):
    """Context manager that yields the path of a new temporary directory.

    All keyword arguments are forwarded unchanged to ``tempfile.mkdtemp``.
    The directory, together with anything created inside it, is removed
    when the ``with`` block exits, whether normally or via an exception.
    """
    tdir = tempfile.mkdtemp(**kw)
    try:
        yield tdir
    finally:
        # Runs on both normal exit and error so nothing is left behind
        shutil.rmtree(tdir)
def installer_name(ext, is64bit=False):
if is64bit and ext == 'msi':

View File

@ -21,7 +21,7 @@ __all__ = [
'upload_user_manual', 'upload_demo', 'reupload',
'stage1', 'stage2', 'stage3', 'stage4', 'stage5', 'publish', 'publish_betas',
'linux', 'linux32', 'linux64', 'win', 'win32', 'win64', 'osx', 'build_dep',
'export_packages',
'export_packages', 'hyphenation'
]
from setup.installers import Linux, Win, OSX, Linux32, Linux64, Win32, Win64, ExtDev, BuildDep, ExportPackages
@ -45,6 +45,9 @@ build = Build()
from setup.mathjax import MathJax
mathjax = MathJax()
from setup.hyphenation import Hyphenation
# Module-level instance of the Hyphenation command; the name matches the 'hyphenation' entry in __all__
hyphenation = Hyphenation()
from setup.git_version import GitVersion
git_version = GitVersion()

103
setup/hyphenation.py Normal file
View File

@ -0,0 +1,103 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import glob
import json
import os
import shutil
import subprocess
from io import BytesIO
from zipfile import ZipFile
from setup import Command, download_securely
# Zip archive of the master branch of the LibreOffice dictionaries repository on GitHub
URL = 'https://github.com/LibreOffice/dictionaries/archive/master.zip'
def locales_from_dicts(dicts):
    """Derive a locale -> dictionary file name mapping from file names alone.

    Every path in ``dicts`` is reduced to its base name, which is expected to
    look like ``hyph_<locale>.dic``. The ``hyph_`` prefix and ``.dic`` suffix
    are removed by length (their presence is not checked) and ``-`` is
    replaced by ``_`` to form the locale key. When two paths produce the
    same locale, the later one wins.
    """
    prefix_len, suffix_len = len('hyph_'), len('.dic')
    basenames = (os.path.basename(p) for p in dicts)
    return {
        fname[prefix_len:-suffix_len].replace('-', '_'): fname
        for fname in basenames
    }
def locales_from_xcu(xcu, dicts):
    """Build a locale -> dictionary file name mapping from an ``.xcu`` file.

    ``xcu`` is the path of an XML file, parsed with lxml. ``dicts`` is an
    iterable of dictionary paths; only their base names are used, as the
    set of file names that may appear in the result.

    For every element whose text contains ``DICT_HYPH``, the element two
    levels up is scanned for children whose registry ``name`` attribute is
    ``Locales`` (whitespace separated locale list, ``-`` replaced by ``_``)
    or ``Locations`` (only the part after the last ``/`` is kept). When both
    are present and the location is one of the known file names, every
    listed locale is mapped to that file name.
    """
    from lxml import etree
    name_attr = '{http://openoffice.org/2001/registry}name'
    with open(xcu, 'rb') as f:
        root = etree.fromstring(f.read())
    available = {os.path.basename(x) for x in dicts}
    ans = {}
    for value in root.xpath('//*[contains(text(),"DICT_HYPH")]'):
        entry = value.getparent().getparent()
        locales = path = None
        for prop in entry:
            kind = prop.get(name_attr)
            if kind == 'Locales':
                locales = [loc.replace('-', '_') for loc in prop[0].text.split()]
            elif kind == 'Locations':
                path = prop[0].text.strip().split('/')[-1]
        # Skip entries with no locales or that point at a file we do not have
        if not locales or path not in available:
            continue
        ans.update(dict.fromkeys(locales, path))
    return ans
def process_dictionaries(src, output_dir):
    """Collect the hyphenation dictionaries found under ``src``.

    Every immediate sub-directory of ``src`` that contains files matching
    ``hyph_*.dic`` is examined. Its locale mapping is read from
    ``dictionaries.xcu`` when that file exists, otherwise it is derived from
    the dictionary file names. When the mapping is non-empty it is merged
    into the overall mapping and all of that directory's ``hyph_*.dic``
    files are copied, flat, into ``output_dir``.

    Finally the merged locale -> file name mapping is written as UTF-8
    encoded JSON to ``locales.json`` inside ``output_dir``.

    Directory entries, globbed files and JSON keys are all processed in
    sorted order so that the result does not depend on the arbitrary
    ordering returned by the filesystem.
    """
    locale_data = {}
    # os.listdir() gives entries in arbitrary order; sorting makes the
    # winner of any locale collision between directories reproducible
    for entry in sorted(os.listdir(src)):
        q = os.path.join(src, entry)
        if not os.path.isdir(q):
            continue
        dicts = tuple(sorted(glob.glob(os.path.join(q, 'hyph_*.dic'))))
        if not dicts:
            continue
        xcu = os.path.join(q, 'dictionaries.xcu')
        locales = (
            locales_from_xcu(xcu, dicts) if os.path.exists(xcu) else
            locales_from_dicts(dicts))
        if locales:
            locale_data.update(locales)
            for d in dicts:
                shutil.copyfile(
                    d, os.path.join(output_dir, os.path.basename(d)))
    # sort_keys keeps the bytes of locales.json stable between runs
    data = json.dumps(locale_data, indent=2, sort_keys=True)
    # json.dumps may return either bytes or text depending on the Python
    # version; the file is opened in binary mode so normalise to bytes
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    with open(os.path.join(output_dir, 'locales.json'), 'wb') as f:
        f.write(data)
class Hyphenation(Command):
    """Setup command that downloads and packages hyphenation dictionaries.

    The archive at ``URL`` is downloaded and unpacked into a temporary
    directory, the dictionaries are collected with ``process_dictionaries``
    and the result is stored in the ``hyphenation`` resources directory as
    ``dictionaries.tar.xz`` plus ``locales.json``.
    """

    description = 'Download the hyphenation dictionaries'

    def add_options(self, parser):
        """Add command line options; this command does not define any."""
        pass

    @property
    def hyphenation_dir(self):
        """Path of the ``hyphenation`` directory below ``self.RESOURCES``."""
        # self.j comes from Command; presumably a path join helper -- confirm
        return self.j(self.RESOURCES, 'hyphenation')

    def clean(self):
        """Delete the hyphenation resources directory if it exists."""
        if os.path.exists(self.hyphenation_dir):
            shutil.rmtree(self.hyphenation_dir)

    def run(self, opts):
        """Download, process and package the dictionaries.

        ``opts`` is accepted for interface compatibility and is not used.
        Raises ``subprocess.CalledProcessError`` if ``tar`` exits with a
        non-zero status.
        """
        self.clean()
        os.makedirs(self.hyphenation_dir)
        self.info('Downloading hyphenation dictionaries...')
        # download_securely() is a project helper; its result is wrapped in
        # BytesIO, so it is assumed to return the archive as bytes
        with self.temp_dir() as src, ZipFile(BytesIO(download_securely(URL))) as zf, self.temp_dir() as output_dir:
            zf.extractall(src)
            # When the archive unpacks to a single top-level entry, descend
            # into it so that process_dictionaries sees its contents
            entries = os.listdir(src)
            if len(entries) == 1:
                src = os.path.join(src, entries[0])
            process_dictionaries(src, output_dir)
            # Sorted so the archive member order does not depend on the
            # arbitrary order returned by os.listdir()
            dics = sorted(x for x in os.listdir(output_dir) if x.endswith('.dic'))
            # Extend the current environment rather than replacing it:
            # passing only XZ_OPT would drop PATH and every other variable,
            # so tar and xz would only be found in the default search path
            env = dict(os.environ, XZ_OPT='-9e -T0')
            subprocess.check_call(
                ['tar', '-cJf', os.path.join(self.hyphenation_dir, 'dictionaries.tar.xz')] + dics,
                env=env, cwd=output_dir)
            shutil.copy(self.j(output_dir, 'locales.json'), self.hyphenation_dir)

View File

@ -199,7 +199,7 @@ class RapydScript(Command): # {{{
class Resources(Command): # {{{
description = 'Compile various needed calibre resources'
sub_commands = ['kakasi', 'mathjax', 'rapydscript']
sub_commands = ['kakasi', 'mathjax', 'rapydscript', 'hyphenation']
def run(self, opts):
from calibre.utils.serialize import msgpack_dumps