Spell check: Convert all dictionaries to UTF-8 before loading them in hunspell

This allows arbitrary Unicode words to be added to the dictionaries as
user words.
This commit is contained in:
Kovid Goyal 2015-07-23 00:17:21 +05:30
parent 9b96964e3d
commit f785c132ad
5 changed files with 16803 additions and 16771 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -153,8 +153,11 @@ def get_dictionary(locale, exact_match=False):
return d
def load_dictionary(dictionary):
    """Load the hunspell dictionary described by *dictionary*.

    The ``.dic`` and ``.aff`` files are read as raw bytes, passed through
    ``convert_to_utf8`` and only then handed to ``hunspell.Dictionary``, so
    the hunspell object is always built from the converted data (and is
    built exactly once).

    :param dictionary: Dictionary record providing ``dicpath``, ``affpath``,
        ``primary_locale``, ``locales``, ``builtin``, ``name`` and ``id``.
    :return: A ``LoadedDictionary`` wrapping the hunspell object.
    """
    # Imported here rather than at module level, as in the original code.
    from calibre.spell.import_from import convert_to_utf8
    with open(dictionary.dicpath, 'rb') as dic, open(dictionary.affpath, 'rb') as aff:
        dic_data, aff_data = dic.read(), aff.read()
    # The files are fully read at this point, so the handles can be released
    # before the (potentially slow) re-encoding and dictionary construction.
    dic_data, aff_data = convert_to_utf8(dic_data, aff_data)
    obj = hunspell.Dictionary(dic_data, aff_data)
    return LoadedDictionary(dictionary.primary_locale, dictionary.locales, obj, dictionary.builtin, dictionary.name, dictionary.id)
class Dictionaries(object):
@ -402,9 +405,14 @@ def test_dictionaries():
eng = parse_lang_code('en')
rec = partial(dictionaries.recognized, locale=eng)
sg = partial(dictionaries.suggestions, locale=eng)
assert rec('recognized')
assert 'adequately' in sg('ade-quately')
assert 'magic. Wand' in sg('magic.wand')
if not rec('recognized'):
raise ValueError('recognized not recognized')
if 'adequately' not in sg('ade-quately'):
raise ValueError('adequately not in %s' % sg('ade-quately'))
if 'magic. Wand' not in sg('magic.wand'):
raise ValueError('magic. Wand not in: %s' % sg('magic.wand'))
d = load_dictionary(get_dictionary(parse_lang_code('es'))).obj
assert d.recognized('Achí')
if __name__ == '__main__':
test_dictionaries()

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, glob, os, shutil, tempfile
import sys, glob, os, tempfile, re, codecs
from lxml import etree
@ -40,6 +40,21 @@ def parse_xcu(raw, origin='%origin%'):
ans[(dic, aff)] = locales
return ans
def convert_to_utf8(dic_data, aff_data, errors='strict'):
    """Re-encode a hunspell ``.dic``/``.aff`` pair as UTF-8.

    The source encoding is taken from the ``SET <encoding>`` directive, which
    is searched for in the first 2048 bytes of ``aff_data``. When the
    directive names a non-UTF-8 encoding that Python knows, the directive is
    rewritten to ``SET UTF-8`` and both byte strings are transcoded. In every
    other case (no directive, already UTF-8, unknown encoding name) the data
    is returned unchanged.

    :param dic_data: Raw bytes of the ``.dic`` file.
    :param aff_data: Raw bytes of the ``.aff`` file.
    :param errors: Error handler passed to ``decode()`` when transcoding.
    :return: ``(dic_data, aff_data)`` as bytes.
    :raises UnicodeDecodeError: If the data is not valid in the declared
        encoding and ``errors`` is ``'strict'``.
    """
    # [ \t]+ keeps the match on a single line; the trailing [ \t\r]* lets the
    # directive match in files with CRLF line endings or trailing blanks.
    m = re.search(br'^SET[ \t]+(\S+)[ \t\r]*$', aff_data[:2048], flags=re.MULTILINE)
    if m is None:
        return dic_data, aff_data

    # codecs.lookup() and bytes.decode() need a text encoding name; a bytes
    # name raises TypeError on Python 3.
    enc = m.group(1).decode('ascii', 'replace')
    if enc.upper() in ('UTF-8', 'UTF8'):
        return dic_data, aff_data

    try:
        codecs.lookup(enc)
    except (LookupError, ValueError):
        # Encoding name unknown to (or rejected by) Python: best effort,
        # leave the data untouched rather than failing.
        return dic_data, aff_data

    # Replace only up to the end of the encoding name so that any trailing
    # whitespace / carriage return on the directive line is preserved.
    aff_data = aff_data[:m.start()] + b'SET UTF-8' + aff_data[m.end(1):]
    aff_data = aff_data.decode(enc, errors).encode('utf-8')
    dic_data = dic_data.decode(enc, errors).encode('utf-8')
    return dic_data, aff_data
def import_from_libreoffice_source_tree(source_path):
dictionaries = {}
for x in glob.glob(os.path.join(source_path, '*', 'dictionaries.xcu')):
@ -58,9 +73,11 @@ def import_from_libreoffice_source_tree(source_path):
dest = os.path.join(base, locale)
if not os.path.exists(dest):
os.makedirs(dest)
for src in (dic, aff):
df = os.path.join(dest, locale + os.path.splitext(src)[1])
shutil.copyfile(src, df)
with open(dic, 'rb') as df, open(aff, 'rb') as af:
dd, ad = convert_to_utf8(df.read(), af.read())
for src, raw in ((dic, dd), (aff, ad)):
with open(os.path.join(dest, locale + os.path.splitext(src)[1]), 'wb') as df:
df.write(raw)
with open(os.path.join(dest, 'locales'), 'wb') as f:
locales.sort(key=lambda x: (0, x) if x == locale else (1, x))
f.write(('\n'.join(locales)).encode('utf-8'))
@ -99,10 +116,11 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
metadata = [name] + list(locales)
with open(os.path.join(d, 'locales'), 'wb') as f:
f.write(('\n'.join(metadata)).encode('utf-8'))
dd, ad = convert_to_utf8(zf.open(dic).read(), zf.open(aff).read())
with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
shutil.copyfileobj(zf.open(dic), f)
f.write(dd)
with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
shutil.copyfileobj(zf.open(aff), f)
f.write(ad)
num += 1
return num

View File

@ -54,6 +54,11 @@ def test_html5lib():
from html5lib import parse # noqa
print ('html5lib OK!')
def test_spell():
    """Run the spell-check dictionary self-test and print a confirmation.

    Any exception raised by ``test_dictionaries`` propagates to the caller,
    in which case the confirmation line is not printed.
    """
    from calibre.spell.dictionary import test_dictionaries as run_dictionary_checks
    run_dictionary_checks()
    print ('hunspell OK!')
def test_plugins():
bad = []
for name in plugins:
@ -233,6 +238,7 @@ def test():
test_dlls()
test_plugins()
test_dukpy()
test_spell()
test_lxml()
test_ssl()
test_sqlite()