diff --git a/src/calibre/utils/hyphenation/hyphenate.py b/src/calibre/utils/hyphenation/hyphenate.py index 90a4ca2344..d001c7ab98 100644 --- a/src/calibre/utils/hyphenation/hyphenate.py +++ b/src/calibre/utils/hyphenation/hyphenate.py @@ -16,29 +16,6 @@ from polyglot.builtins import unicode_type from polyglot.functools import lru_cache REGEX_FLAGS = regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.UNICODE - - -def pats(): - ans = getattr(pats, 'ans', None) - if ans is None: - pats.ans = ans = regex.compile(r'^\p{P}+', REGEX_FLAGS), regex.compile(r'\p{P}+$', REGEX_FLAGS) - return ans - - -def remove_punctuation(word): - leading, trailing = pats() - prefix = suffix = '' - nword, n = leading.subn('', word) - if n > 0: - count = len(word) - len(nword) - prefix, word = word[:count], nword - nword, n = trailing.subn('', word) - if n > 0: - count = len(word) - len(nword) - suffix, word = word[-count:], nword - return prefix, word, suffix - - hyphen = None @@ -60,7 +37,7 @@ def add_soft_hyphens(word, dictionary, hyphen_char='\u00ad'): word = unicode_type(word) if len(word) > 99 or '=' in word: return word - prefix, q, suffix = remove_punctuation(word) + q = word q = q.replace(hyphen_char, '') if len(q) < 4: return word @@ -68,7 +45,7 @@ def add_soft_hyphens(word, dictionary, hyphen_char='\u00ad'): try: ans = hyphen.simple_hyphenate(dictionary, lq) except ValueError: - # Can happen is the word requires non-standard hyphenation (i.e. + # Can happen if the word requires non-standard hyphenation (i.e. 
# replacements) return word parts = ans.split('=') @@ -82,4 +59,55 @@ def add_soft_hyphens(word, dictionary, hyphen_char='\u00ad'): aparts.append(q[pos:pos+lp]) pos += lp parts = aparts - return prefix + hyphen_char.join(parts) + suffix + return hyphen_char.join(parts)
# (The line above is residual diff text from the original line, kept verbatim.)


# Elements whose own text and descendants are never hyphenated. Text that
# follows such an element (its tail) belongs to the parent element and is
# still processed by process_tag().
tags_not_to_hyphenate = frozenset((
    'video', 'audio', 'script', 'code', 'pre', 'img', 'br', 'samp', 'kbd',
    'var', 'abbr', 'acronym', 'sub', 'sup', 'button', 'option', 'label',
    'textarea', 'input', 'math', 'svg', 'style', 'title', 'head'
))


def barename(x):
    """Return the tag name *x* with any leading ``{namespace}`` part removed."""
    return x.split('}', 1)[-1]


def words_pat():
    """Return the compiled pattern matching runs of word characters.

    The pattern is compiled on first use and memoized as an attribute of
    this function, so importing the module does not pay for the compile.
    """
    ans = getattr(words_pat, 'ans', None)
    if ans is None:
        ans = words_pat.ans = regex.compile(r'\w+', REGEX_FLAGS)
    return ans


def add_soft_hyphens_to_words(words, dictionary, hyphen_char='\u00ad'):
    """Hyphenate every word in the string *words*.

    Each run of word characters is passed to add_soft_hyphens(); all other
    characters (whitespace, punctuation, ...) are copied through unchanged.

    :param words: The text to process.
    :param dictionary: The hyphenation dictionary handed to add_soft_hyphens().
    :param hyphen_char: The string inserted at each hyphenation point.
    :return: The text with hyphenation points inserted.
    """
    def hyphenate_match(m):
        return add_soft_hyphens(m.group(), dictionary, hyphen_char)

    # sub() leaves everything between matches untouched, which is exactly
    # the "copy the gaps, replace the words" behaviour needed here.
    return words_pat().sub(hyphenate_match, words)


def process_tag(elem, locale, hyphen_char):
    """Insert hyphenation points into *elem* and all of its descendants.

    The text of *elem* and the tail text of each of its children are
    rewritten in place. The dictionary is looked up for the language named
    by the element's ``lang`` or ``xml:lang`` attribute, falling back to
    *locale*. If no dictionary is available the element's own text is left
    alone, but its children are still visited. Elements listed in
    ``tags_not_to_hyphenate`` are skipped together with their contents.

    :param elem: The element to process.
    :param locale: The language in effect for *elem* when it declares none.
    :param hyphen_char: The string inserted at each hyphenation point.
    """
    if barename(elem.tag) in tags_not_to_hyphenate:
        return
    tl = elem.get('lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang') or locale
    dictionary = dictionary_for_locale(tl)
    if dictionary is not None and elem.text and not elem.text.isspace():
        elem.text = add_soft_hyphens_to_words(elem.text, dictionary, hyphen_char)
    for child in elem:
        # A child's tail is text of *this* element, so it is hyphenated with
        # this element's dictionary even when the child itself is skipped.
        if dictionary is not None and child.tail and not child.tail.isspace():
            child.tail = add_soft_hyphens_to_words(child.tail, dictionary, hyphen_char)
        # Skip nodes whose tag is callable rather than a name (presumably
        # comments and processing instructions in lxml -- confirm).
        if not callable(getattr(child, 'tag', None)):
            # A language declaration applies to the whole subtree, so pass
            # down the resolved language rather than the caller's locale.
            process_tag(child, tl, hyphen_char)


def add_soft_hyphens_to_html(root, locale='en', hyphen_char='\u00ad'):
    """Insert hyphenation points into the parsed HTML tree *root*, in place.

    :param root: The root element of the tree.
    :param locale: The language assumed where the markup declares none.
    :param hyphen_char: The string inserted at each hyphenation point.
    """
    process_tag(root, locale, hyphen_char)


# (Residual diff text from the end of the original line, kept as a comment:)
# diff --git a/src/calibre/utils/hyphenation/test_hyphenation.py b/src/calibre/utils/hyphenation/test_hyphenation.py index
6dfd6cccc5..67ed1fbd69 100644 --- a/src/calibre/utils/hyphenation/test_hyphenation.py +++ b/src/calibre/utils/hyphenation/test_hyphenation.py @@ -8,12 +8,17 @@ import os import shutil import unittest +from lxml import etree + +from calibre.ebooks.oeb.polish.parsing import parse_html5 from calibre.ptempfile import PersistentTemporaryDirectory from calibre.utils.hyphenation.dictionaries import ( - dictionary_name_for_locale, get_cache_path, path_to_dictionary, is_cache_up_to_date + dictionary_name_for_locale, get_cache_path, is_cache_up_to_date, + path_to_dictionary ) from calibre.utils.hyphenation.hyphenate import ( - add_soft_hyphens, dictionary_for_locale, remove_punctuation + add_soft_hyphens, add_soft_hyphens_to_html, add_soft_hyphens_to_words, + dictionary_for_locale ) @@ -73,23 +78,29 @@ class TestHyphenation(unittest.TestCase): self.assertFalse(cache[0]) def test_add_soft_hyphens(self): - self.ae(remove_punctuation('word'), ('', 'word', '')) - self.ae(remove_punctuation('wo.rd.'), ('', 'wo.rd', '.')) - self.ae(remove_punctuation('"«word!!'), ('"«', 'word', '!!')) - - dictionary = dictionary_for_locale('en') - def t(word, expected): self.ae(add_soft_hyphens(word, dictionary, '='), expected) - t('beautiful', 'beau=ti=ful') - t('beautiful.', 'beau=ti=ful.') - t('"beautiful.', '"beau=ti=ful.') - t('BeauTiful', 'Beau=Ti=ful') - dictionary = dictionary_for_locale('hu') t('asszonnyal', 'asszonnyal') + dictionary = dictionary_for_locale('en') + t('beautiful', 'beau=ti=ful') + t('BeauTiful', 'Beau=Ti=ful') + + def w(words, expected): + self.ae(add_soft_hyphens_to_words(words, dictionary, '='), expected) + + w(' A\n beautiful day. ', ' A\n beau=ti=ful day. ') + + def test_hyphenate_html(self): + root = parse_html5(''' +

beautiful, tillata\nExpand "latitude!''', + line_numbers=False) + add_soft_hyphens_to_html(root, hyphen_char='=') + raw = etree.tostring(root, method='text', encoding='unicode') + self.ae(raw, 'beau=ti=ful, tilla=ta\nEx=pand "lat=i=tude!') + def find_tests(): return unittest.defaultTestLoader.loadTestsFromTestCase(TestHyphenation)