mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Spell check: Convert all dictionaries to UTF-8 before loading them in hunspell
This allows adding arbitrary Unicode words as user words to the dictionaries.
This commit is contained in:
parent
9b96964e3d
commit
f785c132ad
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -153,8 +153,11 @@ def get_dictionary(locale, exact_match=False):
|
||||
return d
|
||||
|
||||
def load_dictionary(dictionary):
|
||||
from calibre.spell.import_from import convert_to_utf8
|
||||
with open(dictionary.dicpath, 'rb') as dic, open(dictionary.affpath, 'rb') as aff:
|
||||
obj = hunspell.Dictionary(dic.read(), aff.read())
|
||||
dic_data, aff_data = dic.read(), aff.read()
|
||||
dic_data, aff_data = convert_to_utf8(dic_data, aff_data)
|
||||
obj = hunspell.Dictionary(dic_data, aff_data)
|
||||
return LoadedDictionary(dictionary.primary_locale, dictionary.locales, obj, dictionary.builtin, dictionary.name, dictionary.id)
|
||||
|
||||
class Dictionaries(object):
|
||||
@ -402,9 +405,14 @@ def test_dictionaries():
|
||||
eng = parse_lang_code('en')
|
||||
rec = partial(dictionaries.recognized, locale=eng)
|
||||
sg = partial(dictionaries.suggestions, locale=eng)
|
||||
assert rec('recognized')
|
||||
assert 'adequately' in sg('ade-quately')
|
||||
assert 'magic. Wand' in sg('magic.wand')
|
||||
if not rec('recognized'):
|
||||
raise ValueError('recognized not recognized')
|
||||
if 'adequately' not in sg('ade-quately'):
|
||||
raise ValueError('adequately not in %s' % sg('ade-quately'))
|
||||
if 'magic. Wand' not in sg('magic.wand'):
|
||||
raise ValueError('magic. Wand not in: %s' % sg('magic.wand'))
|
||||
d = load_dictionary(get_dictionary(parse_lang_code('es'))).obj
|
||||
assert d.recognized('Achí')
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_dictionaries()
|
||||
|
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import sys, glob, os, shutil, tempfile
|
||||
import sys, glob, os, tempfile, re, codecs
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@ -40,6 +40,21 @@ def parse_xcu(raw, origin='%origin%'):
|
||||
ans[(dic, aff)] = locales
|
||||
return ans
|
||||
|
||||
def convert_to_utf8(dic_data, aff_data, errors='strict'):
    """Re-encode a hunspell dictionary pair as UTF-8.

    The encoding is taken from the ``SET <encoding>`` directive found in the
    first 2048 bytes of ``aff_data``. When the directive names a codec that
    Python knows and that is not already UTF-8, both byte strings are decoded
    with that codec and re-encoded as UTF-8, and the directive is rewritten
    to declare ``UTF-8``.

    :param dic_data: raw bytes of the ``.dic`` file
    :param aff_data: raw bytes of the ``.aff`` file
    :param errors: error handler passed to ``bytes.decode`` (default
        ``'strict'``, which raises ``UnicodeDecodeError`` on invalid input)
    :return: ``(dic_data, aff_data)`` as bytes. The inputs are returned
        unchanged when there is no ``SET`` directive, when it already
        declares UTF-8, or when the declared encoding is unknown to Python.
    """
    # Tolerate trailing blanks and CRLF line endings: with a bare ``\S+$`` the
    # directive of a Windows-style .aff file would never match.
    m = re.search(br'^SET[ \t]+(\S+)[ \t]*\r?$', aff_data[:2048], flags=re.MULTILINE)
    if m is None:
        return dic_data, aff_data

    declared = m.group(1)
    if declared.upper() in (b'UTF-8', b'UTF8'):
        return dic_data, aff_data

    try:
        # Codec names must be text, not bytes, for codecs.lookup()/decode().
        enc = declared.decode('ascii')
        # hunspell spells Windows-1251 as "microsoft-cp1251", which Python
        # does not recognise under that name.
        enc = {'microsoft-cp1251': 'cp1251'}.get(enc.lower(), enc)
        codecs.lookup(enc)
    except (LookupError, ValueError):
        # Unknown or malformed encoding name: leave the data untouched.
        # (UnicodeDecodeError is a ValueError subclass.)
        return dic_data, aff_data

    # Replace only the encoding token so the surrounding whitespace and the
    # original line ending are preserved.
    aff_data = aff_data[:m.start(1)] + b'UTF-8' + aff_data[m.end(1):]
    aff_data = aff_data.decode(enc, errors).encode('utf-8')
    dic_data = dic_data.decode(enc, errors).encode('utf-8')
    return dic_data, aff_data
|
||||
|
||||
def import_from_libreoffice_source_tree(source_path):
|
||||
dictionaries = {}
|
||||
for x in glob.glob(os.path.join(source_path, '*', 'dictionaries.xcu')):
|
||||
@ -58,9 +73,11 @@ def import_from_libreoffice_source_tree(source_path):
|
||||
dest = os.path.join(base, locale)
|
||||
if not os.path.exists(dest):
|
||||
os.makedirs(dest)
|
||||
for src in (dic, aff):
|
||||
df = os.path.join(dest, locale + os.path.splitext(src)[1])
|
||||
shutil.copyfile(src, df)
|
||||
with open(dic, 'rb') as df, open(aff, 'rb') as af:
|
||||
dd, ad = convert_to_utf8(df.read(), af.read())
|
||||
for src, raw in ((dic, dd), (aff, ad)):
|
||||
with open(os.path.join(dest, locale + os.path.splitext(src)[1]), 'wb') as df:
|
||||
df.write(raw)
|
||||
with open(os.path.join(dest, 'locales'), 'wb') as f:
|
||||
locales.sort(key=lambda x: (0, x) if x == locale else (1, x))
|
||||
f.write(('\n'.join(locales)).encode('utf-8'))
|
||||
@ -99,10 +116,11 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
|
||||
metadata = [name] + list(locales)
|
||||
with open(os.path.join(d, 'locales'), 'wb') as f:
|
||||
f.write(('\n'.join(metadata)).encode('utf-8'))
|
||||
dd, ad = convert_to_utf8(zf.open(dic).read(), zf.open(aff).read())
|
||||
with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
|
||||
shutil.copyfileobj(zf.open(dic), f)
|
||||
f.write(dd)
|
||||
with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
|
||||
shutil.copyfileobj(zf.open(aff), f)
|
||||
f.write(ad)
|
||||
num += 1
|
||||
return num
|
||||
|
||||
|
@ -54,6 +54,11 @@ def test_html5lib():
|
||||
from html5lib import parse # noqa
|
||||
print ('html5lib OK!')
|
||||
|
||||
def test_spell():
    # Run the spell-checker's own self-test; it raises on failure, so reaching
    # the print below means the hunspell plugin and dictionaries work.
    from calibre.spell import dictionary as spell_dictionary
    spell_dictionary.test_dictionaries()
    print ('hunspell OK!')
|
||||
|
||||
def test_plugins():
|
||||
bad = []
|
||||
for name in plugins:
|
||||
@ -233,6 +238,7 @@ def test():
|
||||
test_dlls()
|
||||
test_plugins()
|
||||
test_dukpy()
|
||||
test_spell()
|
||||
test_lxml()
|
||||
test_ssl()
|
||||
test_sqlite()
|
||||
|
Loading…
x
Reference in New Issue
Block a user