Spell check: Convert all dictionaries to UTF-8 before loading them in hunspell

This allows arbitrary Unicode words to be added to the dictionaries as user words.
2025-07-09 03:04:10 -04:00 · 2015-07-23 00:17:21 +05:30 · 2015-07-23 00:17:21 +05:30 · f785c132ad
commit f785c132ad
parent 9b96964e3d
5 changed files with 16803 additions and 16771 deletions
--- a/resources/dictionaries/es-ES/es-ES.aff
+++ b/resources/dictionaries/es-ES/es-ES.aff
--- a/resources/dictionaries/es-ES/es-ES.dic
+++ b/resources/dictionaries/es-ES/es-ES.dic
--- a/src/calibre/spell/dictionary.py
+++ b/src/calibre/spell/dictionary.py
@ -153,8 +153,11 @@ def get_dictionary(locale, exact_match=False):
                return d

 def load_dictionary(dictionary):
+    # Read the raw .dic and .aff files referenced by `dictionary`, pass both
+    # through convert_to_utf8() and hand the result to hunspell. Doing the
+    # conversion at load time (rather than relying on the files already being
+    # UTF-8) is what allows arbitrary unicode user words to be added later.
+    # NOTE(review): the import is function-local, presumably to avoid an
+    # import cycle or import-time cost -- confirm before hoisting it.
+    from calibre.spell.import_from import convert_to_utf8
    with open(dictionary.dicpath, 'rb') as dic, open(dictionary.affpath, 'rb') as aff:
-        obj = hunspell.Dictionary(dic.read(), aff.read())
+        dic_data, aff_data = dic.read(), aff.read()
+        # No-op when the .aff already declares UTF-8 or declares nothing.
+        dic_data, aff_data = convert_to_utf8(dic_data, aff_data)
+        obj = hunspell.Dictionary(dic_data, aff_data)
    return LoadedDictionary(dictionary.primary_locale, dictionary.locales, obj, dictionary.builtin, dictionary.name, dictionary.id)

 class Dictionaries(object):
@ -402,9 +405,14 @@ def test_dictionaries():
    eng = parse_lang_code('en')
    rec = partial(dictionaries.recognized, locale=eng)
    sg = partial(dictionaries.suggestions, locale=eng)
-    assert rec('recognized')
-    assert 'adequately' in sg('ade-quately')
-    assert 'magic. Wand' in sg('magic.wand')
+    if not rec('recognized'):
+        raise ValueError('recognized not recognized')
+    if 'adequately' not in sg('ade-quately'):
+        raise ValueError('adequately not in %s' % sg('ade-quately'))
+    if 'magic. Wand' not in sg('magic.wand'):
+        raise ValueError('magic. Wand not in: %s' % sg('magic.wand'))
+    d = load_dictionary(get_dictionary(parse_lang_code('es'))).obj
+    assert d.recognized('Achí')

 if __name__ == '__main__':
    test_dictionaries()
--- a/src/calibre/spell/import_from.py
+++ b/src/calibre/spell/import_from.py
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

-import sys, glob, os, shutil, tempfile
+import sys, glob, os, tempfile, re, codecs

 from lxml import etree

@ -40,6 +40,21 @@ def parse_xcu(raw, origin='%origin%'):
        ans[(dic, aff)] = locales
    return ans

+def convert_to_utf8(dic_data, aff_data, errors='strict'):
+    '''
+    Transcode the raw bytes of a hunspell dictionary to UTF-8.
+
+    The source encoding is read from the ``SET <encoding>`` directive, which
+    is searched for in the first 2048 bytes of ``aff_data``. When found, the
+    directive is rewritten to ``SET UTF-8`` and both files are re-encoded.
+
+    The inputs are returned unchanged when there is no ``SET`` directive,
+    when it already names UTF-8, or when the named encoding is not known to
+    Python's codec registry (best effort: hunspell gets the original data).
+
+    :param dic_data: contents of the .dic file, as bytes
+    :param aff_data: contents of the .aff file, as bytes
+    :param errors: error handler passed to ``bytes.decode``
+    :return: ``(dic_data, aff_data)`` as bytes
+    :raises UnicodeDecodeError: with ``errors='strict'``, if the data is not
+        valid in the declared encoding
+    '''
+    # [ \t]+ instead of \s+ so the directive cannot span a line break, and an
+    # optional trailing blank/CR so CRLF-terminated .aff files are recognised
+    # (with MULTILINE, $ only matches immediately before \n).
+    m = re.search(br'^SET[ \t]+(\S+)[ \t]*\r?$', aff_data[:2048], flags=re.MULTILINE)
+    if m is None:
+        return dic_data, aff_data
+    try:
+        # Codec names must be text: bytes names are rejected on Python 3.
+        enc = m.group(1).decode('ascii')
+    except UnicodeDecodeError:
+        return dic_data, aff_data
+    if enc.upper() in ('UTF-8', 'UTF8'):
+        return dic_data, aff_data
+    try:
+        codecs.lookup(enc)
+    except LookupError:
+        # Unknown encoding name: leave the dictionary untouched.
+        return dic_data, aff_data
+    # The directive is pure ASCII, so it can be patched before decoding.
+    # Everything after the encoding name (trailing blanks, CR) is preserved.
+    aff_data = aff_data[:m.start()] + b'SET UTF-8' + aff_data[m.end(1):]
+    aff_data = aff_data.decode(enc, errors).encode('utf-8')
+    dic_data = dic_data.decode(enc, errors).encode('utf-8')
+    return dic_data, aff_data
+
 def import_from_libreoffice_source_tree(source_path):
    dictionaries = {}
    for x in glob.glob(os.path.join(source_path, '*', 'dictionaries.xcu')):
@ -58,9 +73,11 @@ def import_from_libreoffice_source_tree(source_path):
            dest = os.path.join(base, locale)
            if not os.path.exists(dest):
                os.makedirs(dest)
-            for src in (dic, aff):
-                df = os.path.join(dest, locale + os.path.splitext(src)[1])
-                shutil.copyfile(src, df)
+            with open(dic, 'rb') as df, open(aff, 'rb') as af:
+                dd, ad = convert_to_utf8(df.read(), af.read())
+            for src, raw in ((dic, dd), (aff, ad)):
+                with open(os.path.join(dest, locale + os.path.splitext(src)[1]), 'wb') as df:
+                    df.write(raw)
            with open(os.path.join(dest, 'locales'), 'wb') as f:
                locales.sort(key=lambda x: (0, x) if x == locale else (1, x))
                f.write(('\n'.join(locales)).encode('utf-8'))
@ -99,10 +116,11 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
            metadata = [name] + list(locales)
            with open(os.path.join(d, 'locales'), 'wb') as f:
                f.write(('\n'.join(metadata)).encode('utf-8'))
+            dd, ad = convert_to_utf8(zf.open(dic).read(), zf.open(aff).read())
            with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
-                shutil.copyfileobj(zf.open(dic), f)
+                f.write(dd)
            with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
-                shutil.copyfileobj(zf.open(aff), f)
+                f.write(ad)
            num += 1
    return num

--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@ -54,6 +54,11 @@ def test_html5lib():
    from html5lib import parse  # noqa
    print ('html5lib OK!')

+def test_spell():
+    # Build-time smoke test for the spell checker: run the dictionary self
+    # test and report success. Any exception it raises propagates to the
+    # caller, so the OK line is only printed when the self test passes.
+    from calibre.spell import dictionary as spell_dictionary
+    spell_dictionary.test_dictionaries()
+    print ('hunspell OK!')
+
 def test_plugins():
    bad = []
    for name in plugins:
@ -233,6 +238,7 @@ def test():
        test_dlls()
    test_plugins()
    test_dukpy()
+    test_spell()
    test_lxml()
    test_ssl()
    test_sqlite()