From 1c54eb1648fd0fce08a3e75c192c3dba1b92e5db Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 30 Nov 2019 10:59:45 +0530
Subject: [PATCH] Code to fetch hyphenation dictionaries

---
 .gitignore           |   1 +
 setup/__init__.py    |  11 ++++-
 setup/commands.py    |   5 ++-
 setup/hyphenation.py | 128 +++++++++++++++++++++++++++++++++++++++++++
 setup/resources.py   |   2 +-
 5 files changed, 144 insertions(+), 3 deletions(-)
 create mode 100644 setup/hyphenation.py

diff --git a/.gitignore b/.gitignore
index 1491de82a8..41ac98075b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ build
 dist
 docs
 resources/localization
+resources/hyphenation
 resources/scripts.calibre_msgpack
 resources/ebook-convert-complete.calibre_msgpack
 resources/builtin_recipes.xml
diff --git a/setup/__init__.py b/setup/__init__.py
index 69450c8948..f312ffb048 100644
--- a/setup/__init__.py
+++ b/setup/__init__.py
@@ -6,7 +6,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
-import sys, re, os, platform, subprocess, time, errno
+import sys, re, os, platform, subprocess, time, errno, tempfile, shutil
+from contextlib import contextmanager
 
 is64bit = platform.architecture()[0] == '64bit'
 iswindows = re.search('win(32|64)', sys.platform)
@@ -289,6 +290,14 @@ class Command(object):
         warnings.append((args, kwargs))
         sys.stdout.flush()
 
+    @contextmanager
+    def temp_dir(self, **kw):
+        ans = tempfile.mkdtemp(**kw)
+        try:
+            yield ans
+        finally:
+            shutil.rmtree(ans)
+
 
 def installer_name(ext, is64bit=False):
     if is64bit and ext == 'msi':
diff --git a/setup/commands.py b/setup/commands.py
index 3cb61a690a..8887bc1303 100644
--- a/setup/commands.py
+++ b/setup/commands.py
@@ -21,7 +21,7 @@ __all__ = [
     'upload_user_manual', 'upload_demo', 'reupload',
     'stage1', 'stage2', 'stage3', 'stage4', 'stage5', 'publish', 'publish_betas',
     'linux', 'linux32', 'linux64', 'win', 'win32', 'win64', 'osx', 'build_dep',
-    'export_packages',
+    'export_packages', 'hyphenation'
 ]
 
 from setup.installers import Linux, Win, OSX, Linux32, Linux64, Win32, Win64, ExtDev, BuildDep, ExportPackages
@@ -45,6 +45,9 @@ build = Build()
 from setup.mathjax import MathJax
 mathjax = MathJax()
 
+from setup.hyphenation import Hyphenation
+hyphenation = Hyphenation()
+
 from setup.git_version import GitVersion
 git_version = GitVersion()
 
diff --git a/setup/hyphenation.py b/setup/hyphenation.py
new file mode 100644
index 0000000000..e020ce30c1
--- /dev/null
+++ b/setup/hyphenation.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+# License: GPLv3 Copyright: 2019, Kovid Goyal
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import glob
+import json
+import os
+import shutil
+import subprocess
+from io import BytesIO
+from zipfile import ZipFile
+
+from setup import Command, download_securely
+
+URL = 'https://github.com/LibreOffice/dictionaries/archive/master.zip'
+
+
+def locales_from_dicts(dicts):
+    '''Map locale names to dictionary file names, based on the file names.
+
+    The base name of every path in dicts is expected to look like
+    hyph_<locale>.dic; hyphens in the locale become underscores.
+    '''
+    ans = {}
+    for path in dicts:
+        name = bname = os.path.basename(path)
+        name = name[len('hyph_'):-len('.dic')]
+        ans[name.replace('-', '_')] = bname
+    return ans
+
+
+def locales_from_xcu(xcu, dicts):
+    '''Map locale names to dictionary file names, as declared in an xcu file.
+
+    xcu is the path to a dictionaries.xcu file. Only entries whose location
+    names one of the files in dicts (compared by base name) are returned.
+    '''
+    from lxml import etree
+    with open(xcu, 'rb') as f:
+        root = etree.fromstring(f.read())
+    ans = {}
+    dicts = {os.path.basename(x) for x in dicts}
+    for value in root.xpath('//*[contains(text(),"DICT_HYPH")]'):
+        node = value.getparent().getparent()
+        locales = path = None
+        for prop in node:
+            name = prop.get('{http://openoffice.org/2001/registry}name')
+            if name == 'Locales':
+                locales = [x.replace('-', '_') for x in prop[0].text.split()]
+            elif name == 'Locations':
+                path = prop[0].text.strip().split('/')[-1]
+        if locales and path in dicts:
+            for locale in locales:
+                ans[locale] = path
+    return ans
+
+
+def process_dictionaries(src, output_dir):
+    '''Collect the hyphenation dictionaries found under src into output_dir.
+
+    Every hyph_*.dic file from the sub-directories of src for which a locale
+    mapping could be determined is copied to output_dir, and the combined
+    locale to file name mapping is written there as locales.json.
+    '''
+    locale_data = {}
+    # Sorted so that, should two directories claim the same locale, the
+    # winner does not depend on the order the OS lists directories in
+    for x in sorted(os.listdir(src)):
+        q = os.path.join(src, x)
+        if not os.path.isdir(q):
+            continue
+        dicts = tuple(glob.glob(os.path.join(q, 'hyph_*.dic')))
+        if not dicts:
+            continue
+        xcu = os.path.join(q, 'dictionaries.xcu')
+        locales = (
+            locales_from_xcu(xcu, dicts) if os.path.exists(xcu) else
+            locales_from_dicts(dicts))
+        if locales:
+            locale_data.update(locales)
+            for d in dicts:
+                shutil.copyfile(
+                    d, os.path.join(output_dir, os.path.basename(d)))
+    # sort_keys keeps the generated file stable across runs
+    data = json.dumps(locale_data, indent=2, sort_keys=True)
+    if not isinstance(data, bytes):
+        data = data.encode('utf-8')
+    with open(os.path.join(output_dir, 'locales.json'), 'wb') as f:
+        f.write(data)
+
+
+class Hyphenation(Command):
+    '''Download the LibreOffice hyphenation dictionaries and store them in the
+    hyphenation resources directory as dictionaries.tar.xz and locales.json.'''
+
+    description = 'Download the hyphenation dictionaries'
+
+    def add_options(self, parser):
+        # This command takes no options
+        pass
+
+    @property
+    def hyphenation_dir(self):
+        return self.j(self.RESOURCES, 'hyphenation')
+
+    def clean(self):
+        if os.path.exists(self.hyphenation_dir):
+            shutil.rmtree(self.hyphenation_dir)
+
+    def run(self, opts):
+        self.clean()
+        os.makedirs(self.hyphenation_dir)
+        self.info('Downloading hyphenation dictionaries...')
+        with self.temp_dir() as src, ZipFile(BytesIO(download_securely(URL))) as zf, self.temp_dir() as output_dir:
+            zf.extractall(src)
+            # If the archive unpacked into a single top-level directory, descend into it
+            if len(os.listdir(src)) == 1:
+                src = os.path.join(src, os.listdir(src)[0])
+            process_dictionaries(src, output_dir)
+            # Sorted so that the order of the archive members is stable
+            dics = sorted(x for x in os.listdir(output_dir) if x.endswith('.dic'))
+            # Extend the inherited environment instead of replacing it, so
+            # that PATH (needed to locate tar and xz) is preserved
+            subprocess.check_call(
+                ['tar', '-cJf', os.path.join(self.hyphenation_dir, 'dictionaries.tar.xz')] + dics,
+                env=dict(os.environ, XZ_OPT='-9e -T0'), cwd=output_dir)
+            shutil.copy(self.j(output_dir, 'locales.json'), self.hyphenation_dir)
diff --git a/setup/resources.py b/setup/resources.py
index 11b3a3938d..a68fac5b99 100644
--- a/setup/resources.py
+++ b/setup/resources.py
@@ -199,7 +199,7 @@ class RapydScript(Command): # {{{
 class Resources(Command): # {{{
 
     description = 'Compile various needed calibre resources'
-    sub_commands = ['kakasi', 'mathjax', 'rapydscript']
+    sub_commands = ['kakasi', 'mathjax', 'rapydscript', 'hyphenation']
 
     def run(self, opts):
         from calibre.utils.serialize import msgpack_dumps