mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Spell check: Convert all dictionaries to UTF-8 before loading them in hunspell
This allows adding arbitrary unicode words as user words to the dictionaries.
This commit is contained in:
parent
9b96964e3d
commit
f785c132ad
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -153,8 +153,11 @@ def get_dictionary(locale, exact_match=False):
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def load_dictionary(dictionary):
|
def load_dictionary(dictionary):
|
||||||
|
from calibre.spell.import_from import convert_to_utf8
|
||||||
with open(dictionary.dicpath, 'rb') as dic, open(dictionary.affpath, 'rb') as aff:
|
with open(dictionary.dicpath, 'rb') as dic, open(dictionary.affpath, 'rb') as aff:
|
||||||
obj = hunspell.Dictionary(dic.read(), aff.read())
|
dic_data, aff_data = dic.read(), aff.read()
|
||||||
|
dic_data, aff_data = convert_to_utf8(dic_data, aff_data)
|
||||||
|
obj = hunspell.Dictionary(dic_data, aff_data)
|
||||||
return LoadedDictionary(dictionary.primary_locale, dictionary.locales, obj, dictionary.builtin, dictionary.name, dictionary.id)
|
return LoadedDictionary(dictionary.primary_locale, dictionary.locales, obj, dictionary.builtin, dictionary.name, dictionary.id)
|
||||||
|
|
||||||
class Dictionaries(object):
|
class Dictionaries(object):
|
||||||
@ -402,9 +405,14 @@ def test_dictionaries():
|
|||||||
eng = parse_lang_code('en')
|
eng = parse_lang_code('en')
|
||||||
rec = partial(dictionaries.recognized, locale=eng)
|
rec = partial(dictionaries.recognized, locale=eng)
|
||||||
sg = partial(dictionaries.suggestions, locale=eng)
|
sg = partial(dictionaries.suggestions, locale=eng)
|
||||||
assert rec('recognized')
|
if not rec('recognized'):
|
||||||
assert 'adequately' in sg('ade-quately')
|
raise ValueError('recognized not recognized')
|
||||||
assert 'magic. Wand' in sg('magic.wand')
|
if 'adequately' not in sg('ade-quately'):
|
||||||
|
raise ValueError('adequately not in %s' % sg('ade-quately'))
|
||||||
|
if 'magic. Wand' not in sg('magic.wand'):
|
||||||
|
raise ValueError('magic. Wand not in: %s' % sg('magic.wand'))
|
||||||
|
d = load_dictionary(get_dictionary(parse_lang_code('es'))).obj
|
||||||
|
assert d.recognized('Achí')
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
test_dictionaries()
|
test_dictionaries()
|
||||||
|
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import sys, glob, os, shutil, tempfile
|
import sys, glob, os, tempfile, re, codecs
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
@ -40,6 +40,21 @@ def parse_xcu(raw, origin='%origin%'):
|
|||||||
ans[(dic, aff)] = locales
|
ans[(dic, aff)] = locales
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def convert_to_utf8(dic_data, aff_data, errors='strict'):
    """Re-encode a hunspell dictionary pair as UTF-8.

    The encoding is taken from the ``SET <encoding>`` directive, which is
    searched for in the first 2048 bytes of ``aff_data``. When that
    directive names a codec Python knows and which is not already UTF-8,
    the directive is rewritten to ``SET UTF-8`` and both byte strings are
    transcoded from the declared encoding to UTF-8.

    :param dic_data: raw bytes of the ``.dic`` file
    :param aff_data: raw bytes of the ``.aff`` file
    :param errors: error handler passed to ``bytes.decode`` when decoding
        from the declared encoding
    :return: ``(dic_data, aff_data)`` as bytes. The inputs are returned
        unchanged when there is no ``SET`` directive, when it already
        declares UTF-8, or when the declared codec is unknown to Python.
    :raises UnicodeDecodeError: if the data is not valid in the declared
        encoding and ``errors`` is ``'strict'``
    """
    # Trailing blanks and a carriage return are captured separately so that
    # CRLF-terminated files are recognised and keep their line endings.
    m = re.search(
        br'^SET\s+(\S+)([ \t\r]*)$', aff_data[:2048], flags=re.MULTILINE)
    if m is None:
        return dic_data, aff_data

    try:
        # codecs.lookup() and bytes.decode() need a text codec name on
        # Python 3; real codec names are plain ASCII.
        enc = m.group(1).decode('ascii')
    except UnicodeDecodeError:
        return dic_data, aff_data

    if enc.upper() in ('UTF-8', 'UTF8'):
        return dic_data, aff_data

    try:
        codecs.lookup(enc)
    except LookupError:
        # Unknown codec name: leave the data untouched.
        return dic_data, aff_data

    # Rewrite the directive first; its replacement is pure ASCII so it
    # survives the decode/encode round trip below unchanged.
    aff_data = (
        aff_data[:m.start()] + b'SET UTF-8' + m.group(2) + aff_data[m.end():])
    aff_data = aff_data.decode(enc, errors).encode('utf-8')
    dic_data = dic_data.decode(enc, errors).encode('utf-8')
    return dic_data, aff_data
|
||||||
|
|
||||||
def import_from_libreoffice_source_tree(source_path):
|
def import_from_libreoffice_source_tree(source_path):
|
||||||
dictionaries = {}
|
dictionaries = {}
|
||||||
for x in glob.glob(os.path.join(source_path, '*', 'dictionaries.xcu')):
|
for x in glob.glob(os.path.join(source_path, '*', 'dictionaries.xcu')):
|
||||||
@ -58,9 +73,11 @@ def import_from_libreoffice_source_tree(source_path):
|
|||||||
dest = os.path.join(base, locale)
|
dest = os.path.join(base, locale)
|
||||||
if not os.path.exists(dest):
|
if not os.path.exists(dest):
|
||||||
os.makedirs(dest)
|
os.makedirs(dest)
|
||||||
for src in (dic, aff):
|
with open(dic, 'rb') as df, open(aff, 'rb') as af:
|
||||||
df = os.path.join(dest, locale + os.path.splitext(src)[1])
|
dd, ad = convert_to_utf8(df.read(), af.read())
|
||||||
shutil.copyfile(src, df)
|
for src, raw in ((dic, dd), (aff, ad)):
|
||||||
|
with open(os.path.join(dest, locale + os.path.splitext(src)[1]), 'wb') as df:
|
||||||
|
df.write(raw)
|
||||||
with open(os.path.join(dest, 'locales'), 'wb') as f:
|
with open(os.path.join(dest, 'locales'), 'wb') as f:
|
||||||
locales.sort(key=lambda x: (0, x) if x == locale else (1, x))
|
locales.sort(key=lambda x: (0, x) if x == locale else (1, x))
|
||||||
f.write(('\n'.join(locales)).encode('utf-8'))
|
f.write(('\n'.join(locales)).encode('utf-8'))
|
||||||
@ -99,10 +116,11 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
|
|||||||
metadata = [name] + list(locales)
|
metadata = [name] + list(locales)
|
||||||
with open(os.path.join(d, 'locales'), 'wb') as f:
|
with open(os.path.join(d, 'locales'), 'wb') as f:
|
||||||
f.write(('\n'.join(metadata)).encode('utf-8'))
|
f.write(('\n'.join(metadata)).encode('utf-8'))
|
||||||
|
dd, ad = convert_to_utf8(zf.open(dic).read(), zf.open(aff).read())
|
||||||
with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
|
with open(os.path.join(d, '%s.dic' % locales[0]), 'wb') as f:
|
||||||
shutil.copyfileobj(zf.open(dic), f)
|
f.write(dd)
|
||||||
with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
|
with open(os.path.join(d, '%s.aff' % locales[0]), 'wb') as f:
|
||||||
shutil.copyfileobj(zf.open(aff), f)
|
f.write(ad)
|
||||||
num += 1
|
num += 1
|
||||||
return num
|
return num
|
||||||
|
|
||||||
|
@ -54,6 +54,11 @@ def test_html5lib():
|
|||||||
from html5lib import parse # noqa
|
from html5lib import parse # noqa
|
||||||
print ('html5lib OK!')
|
print ('html5lib OK!')
|
||||||
|
|
||||||
|
def test_spell():
    """Run the spell-check dictionary self-test, then report success."""
    # Imported inside the function, under a local alias.
    from calibre.spell.dictionary import (
        test_dictionaries as check_dictionaries)
    check_dictionaries()
    print ('hunspell OK!')
|
||||||
|
|
||||||
def test_plugins():
|
def test_plugins():
|
||||||
bad = []
|
bad = []
|
||||||
for name in plugins:
|
for name in plugins:
|
||||||
@ -233,6 +238,7 @@ def test():
|
|||||||
test_dlls()
|
test_dlls()
|
||||||
test_plugins()
|
test_plugins()
|
||||||
test_dukpy()
|
test_dukpy()
|
||||||
|
test_spell()
|
||||||
test_lxml()
|
test_lxml()
|
||||||
test_ssl()
|
test_ssl()
|
||||||
test_sqlite()
|
test_sqlite()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user