mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Function to hyphenate individual words, sans surrounding punctuation
This commit is contained in:
parent
aa1f52460e
commit
1e5de81544
@ -88,14 +88,20 @@ def is_cache_up_to_date(cache_path):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def path_to_dictionary(dictionary_name, cache_callback=None):
|
@lru_cache()
def get_cache_path(cd):
    """Return the ``hyphenation`` directory inside *cd*, creating it if needed.

    The result is memoised per *cd* by ``lru_cache``, so the directory is only
    created on the first call for a given *cd*. Call
    ``get_cache_path.cache_clear()`` to force the check to run again.

    :param cd: Path of the parent cache directory.
    :return: Path of the ``hyphenation`` sub-directory of *cd*.
    :raises EnvironmentError: If the directory cannot be created, or if the
        path already exists but is not a directory.
    """
    cache_path = os.path.join(cd, 'hyphenation')
    try:
        os.makedirs(cache_path)
    except EnvironmentError as err:
        # A pre-existing directory is fine. Anything else, including a regular
        # file occupying the path (which also reports EEXIST), is a real
        # failure and must not be cached as a usable directory.
        if err.errno != errno.EEXIST or not os.path.isdir(cache_path):
            raise
    return cache_path
|
||||||
|
|
||||||
|
|
||||||
|
def path_to_dictionary(dictionary_name, cache_callback=None):
|
||||||
|
cd = getattr(path_to_dictionary, 'cache_dir', None) or cache_dir()
|
||||||
|
cache_path = get_cache_path(cd)
|
||||||
if not is_cache_up_to_date(cache_path):
|
if not is_cache_up_to_date(cache_path):
|
||||||
extract_dicts(cache_path)
|
extract_dicts(cache_path)
|
||||||
if cache_callback is not None:
|
if cache_callback is not None:
|
||||||
|
@ -47,13 +47,14 @@ get_dict_from_args(PyObject *args) {
|
|||||||
|
|
||||||
/*
 * simple_hyphenate(dictionary, word) -> unicode
 *
 * Hyphenate a single word with the given dictionary and return it with '='
 * inserted at every hyphenation point. The word is encoded to the
 * dictionary's character set before hyphenation and the result is decoded
 * from it (undecodable bytes are replaced).
 *
 * Raises ValueError if the encoded word has MAX_CHARS or more bytes, if the
 * hyphenation library reports a failure, or if hyphenating the word would
 * require non-standard hyphenation (replacements), which is not supported.
 */
static PyObject*
simple_hyphenate(PyObject *self, PyObject *args) {
    char hyphenated_word[2*MAX_CHARS] = {0}, hyphens[MAX_CHARS * 3] = {0}, *word_str;
    PyObject *dict_obj;
    char **rep = NULL; int *pos = NULL, *cut = NULL;

    HyphenDict *dict = get_dict_from_args(args);
    if (!dict) return NULL;
    /* "es" allocates word_str; it must be released with PyMem_Free() on every path below */
    if (!PyArg_ParseTuple(args, "Oes", &dict_obj, &dict->cset, &word_str)) return NULL;
    size_t wd_size = strlen(word_str);

    if (wd_size >= MAX_CHARS) {
        PyErr_Format(PyExc_ValueError, "Word to be hyphenated (%s) may have at most %u characters, has %zu.", word_str, MAX_CHARS-1, wd_size);
        PyMem_Free(word_str);
        return NULL;
    }

    if (hnj_hyphen_hyphenate2(dict, word_str, (int)wd_size, hyphens, hyphenated_word, &rep, &pos, &cut)) {
        PyErr_Format(PyExc_ValueError, "Cannot hyphenate word: %s", word_str);
    }
    if (rep) {
        /* Non-standard hyphenation was requested for this word; report it and
         * release the replacement table allocated by the library. */
        PyErr_Format(PyExc_ValueError, "Cannot hyphenate word as it requires replacements: %s", word_str);
        for (size_t i = 0; i < wd_size; i++) {
            if (rep[i]) free(rep[i]);
        }
        free(rep);
    }
    free(pos); free(cut);
    /* Free the word only after the last error message that formats it */
    PyMem_Free(word_str);
    if (PyErr_Occurred()) return NULL;

    return PyUnicode_Decode(hyphenated_word, strlen(hyphenated_word), dict->cset, "replace");
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,11 +4,17 @@
|
|||||||
|
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
# TODO: lower case word? remove trailing punctuation. abort early if contains = or length < 4 or length > 99
|
import os
|
||||||
# TODO: test with replacement words
|
|
||||||
|
|
||||||
|
|
||||||
import regex
|
import regex
|
||||||
|
|
||||||
|
from calibre.constants import plugins
|
||||||
|
from calibre.utils.hyphenation.dictionaries import (
|
||||||
|
dictionary_name_for_locale, path_to_dictionary
|
||||||
|
)
|
||||||
|
from polyglot.builtins import unicode_type
|
||||||
|
from polyglot.functools import lru_cache
|
||||||
|
|
||||||
REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE
|
REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE
|
||||||
|
|
||||||
|
|
||||||
@ -31,3 +37,49 @@ def remove_punctuation(word):
|
|||||||
count = len(word) - len(nword)
|
count = len(word) - len(nword)
|
||||||
suffix, word = word[-count:], nword
|
suffix, word = word[-count:], nword
|
||||||
return prefix, word, suffix
|
return prefix, word, suffix
|
||||||
|
|
||||||
|
|
||||||
|
hyphen = None
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache()
def dictionary_for_locale(locale):
    """Load and return the hyphenation dictionary for *locale*.

    The dictionary name is looked up with ``dictionary_name_for_locale``; its
    file is located with ``path_to_dictionary`` and handed to the ``hyphen``
    plugin as an open, read-only file descriptor. Results are memoised per
    locale by ``lru_cache``.

    :param locale: Locale identifier passed to ``dictionary_name_for_locale``.
    :return: Whatever ``hyphen.load_dictionary`` returns, or ``None`` when no
        dictionary name is known for *locale*.
    :raises RuntimeError: If the ``hyphen`` plugin failed to load.
    """
    global hyphen
    name = dictionary_name_for_locale(locale)
    if name is None:
        return None
    path = path_to_dictionary(name)
    if hyphen is None:
        module, hyphen_err = plugins['hyphen']
        if hyphen_err:
            raise RuntimeError('Failed to load the hyphen plugin with error: {}'.format(hyphen_err))
        # Publish the module only once it is known to have loaded, so a failed
        # load is reported again on the next call instead of being skipped.
        hyphen = module
    # O_BINARY only exists on some platforms, hence the getattr() fallback.
    # NOTE(review): the descriptor is not closed here; presumably
    # load_dictionary() takes ownership of it. Confirm against the plugin.
    fd = os.open(path, getattr(os, 'O_BINARY', 0) | os.O_RDONLY)
    return hyphen.load_dictionary(fd)
|
||||||
|
|
||||||
|
|
||||||
|
def add_soft_hyphens(word, dictionary, hyphen_char='\u00ad'):
    """Return *word* with *hyphen_char* inserted at its hyphenation points.

    Surrounding punctuation is split off with ``remove_punctuation`` and put
    back unchanged; any *hyphen_char* already present in the core word is
    removed before hyphenating. The original letter case is preserved.

    The word is returned unchanged when it is longer than 99 characters,
    contains ``=`` (the marker used by the hyphenation plugin), has a core of
    fewer than 4 characters, cannot be hyphenated by the plugin, or has no
    hyphenation points.

    :param word: The word to hyphenate; converted with ``unicode_type``.
    :param dictionary: Dictionary object passed to ``hyphen.simple_hyphenate``.
    :param hyphen_char: String inserted at each hyphenation point.
    :return: The hyphenated word, or the (unicode) input word unchanged.
    """
    word = unicode_type(word)
    if len(word) > 99 or '=' in word:
        return word
    prefix, q, suffix = remove_punctuation(word)
    q = q.replace(hyphen_char, '')
    if len(q) < 4:
        return word
    lq = q.lower()  # the hyphen library needs lowercase words to work
    try:
        ans = hyphen.simple_hyphenate(dictionary, lq)
    except ValueError:
        # Can happen if the word requires non-standard hyphenation (i.e.
        # replacements)
        return word
    parts = ans.split('=')
    if len(parts) == 1:
        return word
    if lq != q:
        # The plugin returned lower-cased pieces; map them back onto the
        # original word by length to restore the original case. That is only
        # valid if the pieces cover the original word exactly: lower-casing
        # or the lossy decode of the result may have changed its length.
        if sum(len(p) for p in parts) != len(q):
            return word
        aparts = []
        pos = 0
        for p in parts:
            lp = len(p)
            aparts.append(q[pos:pos+lp])
            pos += lp
        parts = aparts
    return prefix + hyphen_char.join(parts) + suffix
|
||||||
|
@ -10,9 +10,11 @@ import unittest
|
|||||||
|
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||||
from calibre.utils.hyphenation.dictionaries import (
|
from calibre.utils.hyphenation.dictionaries import (
|
||||||
dictionary_name_for_locale, path_to_dictionary
|
dictionary_name_for_locale, get_cache_path, path_to_dictionary
|
||||||
|
)
|
||||||
|
from calibre.utils.hyphenation.hyphenate import (
|
||||||
|
add_soft_hyphens, dictionary_for_locale, remove_punctuation
|
||||||
)
|
)
|
||||||
from calibre.utils.hyphenation.hyphenate import remove_punctuation
|
|
||||||
|
|
||||||
|
|
||||||
class TestHyphenation(unittest.TestCase):
|
class TestHyphenation(unittest.TestCase):
|
||||||
@ -23,9 +25,15 @@ class TestHyphenation(unittest.TestCase):
|
|||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
tdir = PersistentTemporaryDirectory()
|
tdir = PersistentTemporaryDirectory()
|
||||||
path_to_dictionary.cache_dir = tdir
|
path_to_dictionary.cache_dir = tdir
|
||||||
|
dictionary_name_for_locale.cache_clear()
|
||||||
|
dictionary_for_locale.cache_clear()
|
||||||
|
get_cache_path.cache_clear()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
|
dictionary_name_for_locale.cache_clear()
|
||||||
|
dictionary_for_locale.cache_clear()
|
||||||
|
get_cache_path.cache_clear()
|
||||||
try:
|
try:
|
||||||
shutil.rmtree(path_to_dictionary.cache_dir)
|
shutil.rmtree(path_to_dictionary.cache_dir)
|
||||||
except EnvironmentError:
|
except EnvironmentError:
|
||||||
@ -68,6 +76,20 @@ class TestHyphenation(unittest.TestCase):
|
|||||||
self.ae(remove_punctuation('wo.rd.'), ('', 'wo.rd', '.'))
|
self.ae(remove_punctuation('wo.rd.'), ('', 'wo.rd', '.'))
|
||||||
self.ae(remove_punctuation('"«word!!'), ('"«', 'word', '!!'))
|
self.ae(remove_punctuation('"«word!!'), ('"«', 'word', '!!'))
|
||||||
|
|
||||||
|
def test_add_soft_hyphens(self):
    """Check add_soft_hyphens() using '=' as a visible hyphenation marker."""

    def t(word, expected, locale='en'):
        # The locale is passed explicitly rather than relying on a rebound
        # outer variable; dictionary_for_locale() is cached, so repeated
        # lookups are cheap.
        dictionary = dictionary_for_locale(locale)
        self.ae(add_soft_hyphens(word, dictionary, '='), expected)

    t('beautiful', 'beau=ti=ful')
    t('beautiful.', 'beau=ti=ful.')
    t('"beautiful.', '"beau=ti=ful.')
    t('BeauTiful', 'Beau=Ti=ful')

    # Early exits: words shorter than four characters and words that already
    # contain the '=' marker are returned unchanged
    t('the', 'the')
    t('beau=tiful', 'beau=tiful')

    # Needs non-standard hyphenation (replacements), so it is left unchanged
    t('asszonnyal', 'asszonnyal', locale='hu')
|
||||||
|
|
||||||
|
|
||||||
def find_tests():
|
def find_tests():
|
||||||
return unittest.defaultTestLoader.loadTestsFromTestCase(TestHyphenation)
|
return unittest.defaultTestLoader.loadTestsFromTestCase(TestHyphenation)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user