From 1e5de81544127bf147a45501da078767c21cf74d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 2 Dec 2019 14:44:54 +0530 Subject: [PATCH] Function to hyphenate individual words, sans surrounding punctuation --- src/calibre/utils/hyphenation/dictionaries.py | 10 +++- src/calibre/utils/hyphenation/hyphen.c | 26 ++++----- src/calibre/utils/hyphenation/hyphenate.py | 58 ++++++++++++++++++- .../utils/hyphenation/test_hyphenation.py | 26 ++++++++- 4 files changed, 100 insertions(+), 20 deletions(-) diff --git a/src/calibre/utils/hyphenation/dictionaries.py b/src/calibre/utils/hyphenation/dictionaries.py index aa1df4d551..e2af459e46 100644 --- a/src/calibre/utils/hyphenation/dictionaries.py +++ b/src/calibre/utils/hyphenation/dictionaries.py @@ -88,14 +88,20 @@ def is_cache_up_to_date(cache_path): return False -def path_to_dictionary(dictionary_name, cache_callback=None): - cd = getattr(path_to_dictionary, 'cache_dir', None) or cache_dir() +@lru_cache() +def get_cache_path(cd): cache_path = os.path.join(cd, 'hyphenation') try: os.makedirs(cache_path) except EnvironmentError as err: if err.errno != errno.EEXIST: raise + return cache_path + + +def path_to_dictionary(dictionary_name, cache_callback=None): + cd = getattr(path_to_dictionary, 'cache_dir', None) or cache_dir() + cache_path = get_cache_path(cd) if not is_cache_up_to_date(cache_path): extract_dicts(cache_path) if cache_callback is not None: diff --git a/src/calibre/utils/hyphenation/hyphen.c b/src/calibre/utils/hyphenation/hyphen.c index 6c48cd9812..153c6f8c99 100644 --- a/src/calibre/utils/hyphenation/hyphen.c +++ b/src/calibre/utils/hyphenation/hyphen.c @@ -47,13 +47,14 @@ get_dict_from_args(PyObject *args) { static PyObject* simple_hyphenate(PyObject *self, PyObject *args) { - char hyphenated_word[2*MAX_CHARS], hyphens[MAX_CHARS * 3] = {0}, *word_str; + char hyphenated_word[2*MAX_CHARS] = {0}, hyphens[MAX_CHARS * 3] = {0}, *word_str; PyObject *dict_obj; + char **rep = NULL; int *pos = NULL, *cut = NULL; 
HyphenDict *dict = get_dict_from_args(args); if (!dict) return NULL; if (!PyArg_ParseTuple(args, "Oes", &dict_obj, &dict->cset, &word_str)) return NULL; - size_t wd_size = strlen(word_str), hwl = 0; + size_t wd_size = strlen(word_str); if (wd_size >= MAX_CHARS) { PyErr_Format(PyExc_ValueError, "Word to be hyphenated (%s) may have at most %u characters, has %zu.", word_str, MAX_CHARS-1, wd_size); @@ -61,22 +62,21 @@ simple_hyphenate(PyObject *self, PyObject *args) { return NULL; } - // we use the simple (old) algorithm since we dont handle replacements - // anyway - if (hnj_hyphen_hyphenate(dict, word_str, (int)wd_size, hyphens)) { + if (hnj_hyphen_hyphenate2(dict, word_str, (int)wd_size, hyphens, hyphenated_word, &rep, &pos, &cut)) { PyErr_Format(PyExc_ValueError, "Cannot hyphenate word: %s", word_str); - } else { - for (size_t i = 0; i < wd_size; i++) { - if (hyphens[i] & 1) { - hyphenated_word[hwl++] = '='; - } - hyphenated_word[hwl++] = word_str[i]; - } } PyMem_Free(word_str); + if (rep) { + PyErr_Format(PyExc_ValueError, "Cannot hyphenate word as it requires replacements: %s", word_str); + for (size_t i = 0; i < wd_size; i++) { + if (rep[i]) free(rep[i]); + } + free(rep); + } + free(pos); free(cut); if (PyErr_Occurred()) return NULL; - return PyUnicode_Decode(hyphenated_word, hwl, dict->cset, "replace"); + return PyUnicode_Decode(hyphenated_word, strlen(hyphenated_word), dict->cset, "replace"); } diff --git a/src/calibre/utils/hyphenation/hyphenate.py b/src/calibre/utils/hyphenation/hyphenate.py index bc4dc447bb..90a4ca2344 100644 --- a/src/calibre/utils/hyphenation/hyphenate.py +++ b/src/calibre/utils/hyphenation/hyphenate.py @@ -4,11 +4,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals -# TODO: lower case word? remove trailing punctuation. 
abort early if contains = or length < 4 or length > 99 -# TODO: test with replacement words - +import os import regex + +from calibre.constants import plugins +from calibre.utils.hyphenation.dictionaries import ( + dictionary_name_for_locale, path_to_dictionary +) +from polyglot.builtins import unicode_type +from polyglot.functools import lru_cache + REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE @@ -31,3 +37,49 @@ def remove_punctuation(word): count = len(word) - len(nword) suffix, word = word[-count:], nword return prefix, word, suffix + + +hyphen = None + + +@lru_cache() +def dictionary_for_locale(locale): + global hyphen + name = dictionary_name_for_locale(locale) + if name is not None: + path = path_to_dictionary(name) + if hyphen is None: + hyphen, hyphen_err = plugins['hyphen'] + if hyphen_err: + raise RuntimeError('Failed to load the hyphen plugin with error: {}'.format(hyphen_err)) + fd = os.open(path, getattr(os, 'O_BINARY', 0) | os.O_RDONLY) + return hyphen.load_dictionary(fd) + + +def add_soft_hyphens(word, dictionary, hyphen_char='\u00ad'): + word = unicode_type(word) + if len(word) > 99 or '=' in word: + return word + prefix, q, suffix = remove_punctuation(word) + q = q.replace(hyphen_char, '') + if len(q) < 4: + return word + lq = q.lower() # the hyphen library needs lowercase words to work + try: + ans = hyphen.simple_hyphenate(dictionary, lq) + except ValueError: + # Can happen if the word requires non-standard hyphenation (i.e. 
+ # replacements) + return word + parts = ans.split('=') + if len(parts) == 1: + return word + if lq != q: + aparts = [] + pos = 0 + for p in parts: + lp = len(p) + aparts.append(q[pos:pos+lp]) + pos += lp + parts = aparts + return prefix + hyphen_char.join(parts) + suffix diff --git a/src/calibre/utils/hyphenation/test_hyphenation.py b/src/calibre/utils/hyphenation/test_hyphenation.py index 3795581691..f5060d963f 100644 --- a/src/calibre/utils/hyphenation/test_hyphenation.py +++ b/src/calibre/utils/hyphenation/test_hyphenation.py @@ -10,9 +10,11 @@ import unittest from calibre.ptempfile import PersistentTemporaryDirectory from calibre.utils.hyphenation.dictionaries import ( - dictionary_name_for_locale, path_to_dictionary + dictionary_name_for_locale, get_cache_path, path_to_dictionary +) +from calibre.utils.hyphenation.hyphenate import ( + add_soft_hyphens, dictionary_for_locale, remove_punctuation ) -from calibre.utils.hyphenation.hyphenate import remove_punctuation class TestHyphenation(unittest.TestCase): @@ -23,9 +25,15 @@ class TestHyphenation(unittest.TestCase): def setUpClass(cls): tdir = PersistentTemporaryDirectory() path_to_dictionary.cache_dir = tdir + dictionary_name_for_locale.cache_clear() + dictionary_for_locale.cache_clear() + get_cache_path.cache_clear() @classmethod def tearDownClass(cls): + dictionary_name_for_locale.cache_clear() + dictionary_for_locale.cache_clear() + get_cache_path.cache_clear() try: shutil.rmtree(path_to_dictionary.cache_dir) except EnvironmentError: @@ -68,6 +76,20 @@ class TestHyphenation(unittest.TestCase): self.ae(remove_punctuation('wo.rd.'), ('', 'wo.rd', '.')) self.ae(remove_punctuation('"«word!!'), ('"«', 'word', '!!')) + def test_add_soft_hyphens(self): + dictionary = dictionary_for_locale('en') + + def t(word, expected): + self.ae(add_soft_hyphens(word, dictionary, '='), expected) + + t('beautiful', 'beau=ti=ful') + t('beautiful.', 'beau=ti=ful.') + t('"beautiful.', '"beau=ti=ful.') + t('BeauTiful', 'Beau=Ti=ful') 
+ + dictionary = dictionary_for_locale('hu') + t('asszonnyal', 'asszonnyal') + def find_tests(): return unittest.defaultTestLoader.loadTestsFromTestCase(TestHyphenation)