Function to hyphenate individual words, sans surrounding punctuation

2025-07-09 03:04:10 -04:00 · 2019-12-02 14:44:54 +05:30 · 2019-12-02 14:44:54 +05:30 · 1e5de81544
commit 1e5de81544
parent aa1f52460e
4 changed files with 100 additions and 20 deletions
--- a/src/calibre/utils/hyphenation/dictionaries.py
+++ b/src/calibre/utils/hyphenation/dictionaries.py
@@ -88,14 +88,20 @@ def is_cache_up_to_date(cache_path):
    return False


-def path_to_dictionary(dictionary_name, cache_callback=None):
-    cd = getattr(path_to_dictionary, 'cache_dir', None) or cache_dir()
+@lru_cache()
+def get_cache_path(cd):
    cache_path = os.path.join(cd, 'hyphenation')
    try:
        os.makedirs(cache_path)
    except EnvironmentError as err:
        if err.errno != errno.EEXIST:
            raise
+    return cache_path
+
+
+def path_to_dictionary(dictionary_name, cache_callback=None):
+    cd = getattr(path_to_dictionary, 'cache_dir', None) or cache_dir()
+    cache_path = get_cache_path(cd)
    if not is_cache_up_to_date(cache_path):
        extract_dicts(cache_path)
        if cache_callback is not None:
--- a/src/calibre/utils/hyphenation/hyphen.c
+++ b/src/calibre/utils/hyphenation/hyphen.c
@@ -47,13 +47,14 @@ get_dict_from_args(PyObject *args) {

 static PyObject*
 simple_hyphenate(PyObject *self, PyObject *args) {
-    char hyphenated_word[2*MAX_CHARS], hyphens[MAX_CHARS * 3] = {0}, *word_str;
+    char hyphenated_word[2*MAX_CHARS] = {0}, hyphens[MAX_CHARS * 3] = {0}, *word_str;
 	PyObject *dict_obj;
+	char **rep = NULL; int *pos = NULL, *cut = NULL;

 	HyphenDict *dict = get_dict_from_args(args);
 	if (!dict) return NULL;
    if (!PyArg_ParseTuple(args, "Oes", &dict_obj, &dict->cset, &word_str)) return NULL;
-    size_t wd_size = strlen(word_str), hwl = 0;
+    size_t wd_size = strlen(word_str);

    if (wd_size >= MAX_CHARS) {
        PyErr_Format(PyExc_ValueError, "Word to be hyphenated (%s) may have at most %u characters, has %zu.", word_str, MAX_CHARS-1, wd_size);
@@ -61,22 +62,21 @@ simple_hyphenate(PyObject *self, PyObject *args) {
        return NULL;
    }

-	// we use the simple (old) algorithm since we dont handle replacements
-	// anyway
-    if (hnj_hyphen_hyphenate(dict, word_str, (int)wd_size, hyphens)) {
+    if (hnj_hyphen_hyphenate2(dict, word_str, (int)wd_size, hyphens, hyphenated_word, &rep, &pos, &cut)) {
        PyErr_Format(PyExc_ValueError, "Cannot hyphenate word: %s", word_str);
-    } else {
-		for (size_t i = 0; i < wd_size; i++) {
-			if (hyphens[i] & 1) {
-				hyphenated_word[hwl++] = '=';
-			}
-			hyphenated_word[hwl++] = word_str[i];
-		}
 	}
 	PyMem_Free(word_str);
+	if (rep) {
+        PyErr_Format(PyExc_ValueError, "Cannot hyphenate word as it requires replacements: %s", word_str);
+		for (size_t i = 0; i < wd_size; i++) {
+			if (rep[i]) free(rep[i]);
+		}
+		free(rep);
+	}
+	free(pos); free(cut);
 	if (PyErr_Occurred()) return NULL;

-	return PyUnicode_Decode(hyphenated_word, hwl, dict->cset, "replace");
+	return PyUnicode_Decode(hyphenated_word, strlen(hyphenated_word), dict->cset, "replace");
 }


--- a/src/calibre/utils/hyphenation/hyphenate.py
+++ b/src/calibre/utils/hyphenation/hyphenate.py
@@ -4,11 +4,17 @@

 from __future__ import absolute_import, division, print_function, unicode_literals

-# TODO: lower case word? remove trailing punctuation. abort early if contains = or length < 4 or length > 99
-# TODO: test with replacement words
-
+import os

 import regex
+
+from calibre.constants import plugins
+from calibre.utils.hyphenation.dictionaries import (
+    dictionary_name_for_locale, path_to_dictionary
+)
+from polyglot.builtins import unicode_type
+from polyglot.functools import lru_cache
+
 REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE


@@ -31,3 +37,49 @@ def remove_punctuation(word):
        count = len(word) - len(nword)
        suffix, word = word[-count:], nword
    return prefix, word, suffix
+
+
+hyphen = None
+
+
+@lru_cache()
+def dictionary_for_locale(locale):
+    global hyphen
+    name = dictionary_name_for_locale(locale)
+    if name is not None:
+        path = path_to_dictionary(name)
+        if hyphen is None:
+            hyphen, hyphen_err = plugins['hyphen']
+            if hyphen_err:
+                raise RuntimeError('Failed to load the hyphen plugin with error: {}'.format(hyphen_err))
+        fd = os.open(path, getattr(os, 'O_BINARY', 0) | os.O_RDONLY)
+        return hyphen.load_dictionary(fd)
+
+
+def add_soft_hyphens(word, dictionary, hyphen_char='\u00ad'):
+    word = unicode_type(word)
+    if len(word) > 99 or '=' in word:
+        return word
+    prefix, q, suffix = remove_punctuation(word)
+    q = q.replace(hyphen_char, '')
+    if len(q) < 4:
+        return word
+    lq = q.lower()  # the hyphen library needs lowercase words to work
+    try:
+        ans = hyphen.simple_hyphenate(dictionary, lq)
+    except ValueError:
+        # Can happen if the word requires non-standard hyphenation (i.e.
+        # replacements)
+        return word
+    parts = ans.split('=')
+    if len(parts) == 1:
+        return word
+    if lq != q:
+        aparts = []
+        pos = 0
+        for p in parts:
+            lp = len(p)
+            aparts.append(q[pos:pos+lp])
+            pos += lp
+        parts = aparts
+    return prefix + hyphen_char.join(parts) + suffix
--- a/src/calibre/utils/hyphenation/test_hyphenation.py
+++ b/src/calibre/utils/hyphenation/test_hyphenation.py
@@ -10,9 +10,11 @@ import unittest

 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.hyphenation.dictionaries import (
-    dictionary_name_for_locale, path_to_dictionary
+    dictionary_name_for_locale, get_cache_path, path_to_dictionary
+)
+from calibre.utils.hyphenation.hyphenate import (
+    add_soft_hyphens, dictionary_for_locale, remove_punctuation
 )
-from calibre.utils.hyphenation.hyphenate import remove_punctuation


 class TestHyphenation(unittest.TestCase):
@@ -23,9 +25,15 @@ class TestHyphenation(unittest.TestCase):
    def setUpClass(cls):
        tdir = PersistentTemporaryDirectory()
        path_to_dictionary.cache_dir = tdir
+        dictionary_name_for_locale.cache_clear()
+        dictionary_for_locale.cache_clear()
+        get_cache_path.cache_clear()

    @classmethod
    def tearDownClass(cls):
+        dictionary_name_for_locale.cache_clear()
+        dictionary_for_locale.cache_clear()
+        get_cache_path.cache_clear()
        try:
            shutil.rmtree(path_to_dictionary.cache_dir)
        except EnvironmentError:
@@ -68,6 +76,20 @@ class TestHyphenation(unittest.TestCase):
        self.ae(remove_punctuation('wo.rd.'), ('', 'wo.rd', '.'))
        self.ae(remove_punctuation('"«word!!'), ('"«', 'word', '!!'))

+    def test_add_soft_hyphens(self):
+        dictionary = dictionary_for_locale('en')
+
+        def t(word, expected):
+            self.ae(add_soft_hyphens(word, dictionary, '='), expected)
+
+        t('beautiful', 'beau=ti=ful')
+        t('beautiful.', 'beau=ti=ful.')
+        t('"beautiful.', '"beau=ti=ful.')
+        t('BeauTiful', 'Beau=Ti=ful')
+
+        dictionary = dictionary_for_locale('hu')
+        t('asszonnyal', 'asszonnyal')
+

 def find_tests():
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestHyphenation)