calibre/src/calibre/spell/dictionary.py

#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

import os, glob, shutil, re, sys
from collections import namedtuple, defaultdict
from itertools import chain
from functools import partial

from calibre import prints
from calibre.constants import plugins, config_dir
from calibre.spell import parse_lang_code
from calibre.utils.config import JSONConfig
from calibre.utils.icu import capitalize
from calibre.utils.localization import get_lang, get_system_locale
from polyglot.builtins import iteritems, itervalues, unicode_type, filter

# Description of a dictionary on disk: the locales it covers, the paths to its
# .dic and .aff files, whether it is builtin, and its name and id (both None
# for the entries created by builtin_dictionaries()).
Dictionary = namedtuple('Dictionary', 'primary_locale locales dicpath affpath builtin name id')
# A dictionary whose data has been loaded into a hunspell object (``obj``).
# Note that the namedtuple typename is also 'Dictionary', so instances of this
# type are displayed as ``Dictionary(...)`` in their repr.
LoadedDictionary = namedtuple('Dictionary', 'primary_locale locales obj builtin name id')
# plugins['hunspell'] is indexed as (module, error message); the module is
# None when loading failed, in which case this module cannot be used at all.
hunspell = plugins['hunspell'][0]
if hunspell is None:
    raise RuntimeError('Failed to load hunspell: %s' % plugins['hunspell'][1])
# Spell checking preferences, stored under 'dictionaries/prefs.json'
dprefs = JSONConfig('dictionaries/prefs.json')
# Keys are locale strings and values are dictionary ids (see preferred_dictionary())
dprefs.defaults['preferred_dictionaries'] = {}
# Keys are language codes and values are locale strings (see best_locale_for_language())
dprefs.defaults['preferred_locales'] = {}
# Serialized form of UserDictionary objects; by default a single, active and
# empty user dictionary exists
dprefs.defaults['user_dictionaries'] = [{'name':_('Default'), 'is_active':True, 'words':[]}]
# Sentinel used to tell "no cache entry" apart from a cached value of None
not_present = object()


class UserDictionary(object):
    '''
    A named, user maintained collection of words.

    ``words`` is a set of ``(word, langcode)`` tuples, ``is_active`` records
    whether the dictionary is currently in use and ``name`` identifies it.
    '''

    __slots__ = ('name', 'is_active', 'words')

    def __init__(self, **kwargs):
        '''
        Build the dictionary from its serialized form (see :meth:`serialize`).

        The keyword arguments ``name``, ``is_active`` and ``words`` are all
        required; ``words`` must be an iterable of two item sequences.
        '''
        self.name = kwargs['name']
        self.is_active = kwargs['is_active']
        # Normalize every entry to a tuple so that it is hashable, the stored
        # form may well use lists for the pairs
        pairs = set()
        for word, langcode in kwargs['words']:
            pairs.add((word, langcode))
        self.words = pairs

    def serialize(self):
        ''' Return a dict that can be passed back to the constructor as keyword arguments. '''
        data = {'name':self.name, 'is_active': self.is_active}
        data['words'] = [(word, langcode) for word, langcode in self.words]
        return data


# Caches filled in on first use by builtin_dictionaries() and
# custom_dictionaries() respectively; None means "not yet read"
_builtins = _custom = None


def builtin_dictionaries():
    '''
    Return a frozenset of :class:`Dictionary` tuples, one for every
    ``*/locales`` file found under ``P('dictionaries', allow_user_override=False)``.

    The first non-empty line of a ``locales`` file is the primary locale, it
    also names the ``.dic`` and ``.aff`` files that live next to the
    ``locales`` file. All non-empty lines together are the locales the
    dictionary covers. The returned tuples have ``builtin`` set to True and
    both ``name`` and ``id`` set to None.

    The result is computed once and cached in the module level ``_builtins``.
    '''
    global _builtins
    if _builtins is None:
        dics = []
        for lc in glob.glob(os.path.join(P('dictionaries', allow_user_override=False), '*/locales')):
            # Use a context manager so that the file is closed immediately
            # rather than whenever the file object is garbage collected
            with open(lc, 'rb') as f:
                locales = list(filter(None, f.read().decode('utf-8').splitlines()))
            if not locales:
                # Nothing usable in this file, skip it just as
                # custom_dictionaries() skips malformed locales files
                continue
            locale = locales[0]
            base = os.path.dirname(lc)
            dics.append(Dictionary(
                parse_lang_code(locale), frozenset(map(parse_lang_code, locales)), os.path.join(base, '%s.dic' % locale),
                os.path.join(base, '%s.aff' % locale), True, None, None))
        _builtins = frozenset(dics)
    return _builtins


def custom_dictionaries(reread=False):
    '''
    Return a frozenset of :class:`Dictionary` tuples, one for every usable
    ``*/locales`` file found in the ``dictionaries`` sub-directory of
    ``config_dir``.

    The first non-empty line of a ``locales`` file is the name of the
    dictionary, the second is its primary locale (which also names the
    ``.dic`` and ``.aff`` files next to the ``locales`` file) and the second
    and all following lines are the locales the dictionary covers. Files with
    fewer than two non-empty lines, or whose primary locale has no country
    code, are skipped. Covered locales without a country code are dropped.
    The ``id`` of a dictionary is the name of the directory it lives in.

    The result is cached in the module level ``_custom``.

    :param reread: If True, ignore the cache and read from disk again
    '''
    global _custom
    if _custom is None or reread:
        dics = []
        for lc in glob.glob(os.path.join(config_dir, 'dictionaries', '*/locales')):
            # Use a context manager so that the file is closed immediately
            # rather than whenever the file object is garbage collected
            with open(lc, 'rb') as f:
                locales = list(filter(None, f.read().decode('utf-8').splitlines()))
            try:
                name, locale, locales = locales[0], locales[1], locales[1:]
            except IndexError:
                # Need at least a name and a primary locale
                continue
            base = os.path.dirname(lc)
            ploc = parse_lang_code(locale)
            if ploc.countrycode is None:
                continue
            dics.append(Dictionary(
                ploc, frozenset(filter(lambda x:x.countrycode is not None, map(parse_lang_code, locales))), os.path.join(base, '%s.dic' % locale),
                os.path.join(base, '%s.aff' % locale), False, name, os.path.basename(base)))
        _custom = frozenset(dics)
    return _custom


# The variant of English to prefer: en-US, unless the system locale is English
# for one of the countries listed below, in which case that country's variant
# is used instead
default_en_locale = 'en-US'
try:
    ul = parse_lang_code(get_system_locale() or 'en-US')
except ValueError:
    # The system locale could not be parsed, stay with en-US
    ul = None
if ul is not None and ul.langcode == 'eng' and ul.countrycode in 'GB BS BZ GH IE IN JM NZ TT'.split():
    default_en_locale = 'en-' + ul.countrycode
# Maps a language code to the locale best_locale_for_language() falls back on
# when dprefs['preferred_locales'] has no entry for that language
default_preferred_locales = {'eng':default_en_locale, 'deu':'de-DE', 'spa':'es-ES', 'fra':'fr-FR'}


def best_locale_for_language(langcode):
    '''
    Return the parsed locale that should be used for the language
    ``langcode``, or None if there is none.

    An entry in ``dprefs['preferred_locales']`` wins over the one in
    ``default_preferred_locales``.
    '''
    fallback = default_preferred_locales.get(langcode, None)
    chosen = dprefs['preferred_locales'].get(langcode, fallback)
    if chosen is None:
        return None
    return parse_lang_code(chosen)


def preferred_dictionary(locale):
    '''
    Return the value stored in ``dprefs['preferred_dictionaries']`` for
    ``locale``, or None if there is no entry for it.

    The keys of the stored mapping are locale strings, so every one of them
    is parsed before being compared against ``locale``.
    '''
    by_locale = {}
    for code, dictionary_id in iteritems(dprefs['preferred_dictionaries']):
        by_locale[parse_lang_code(code)] = dictionary_id
    return by_locale.get(locale, None)


def remove_dictionary(dictionary):
    '''
    Delete the directory containing ``dictionary`` and drop every entry of
    ``dprefs['preferred_dictionaries']`` that refers to it.

    :raises ValueError: If ``dictionary`` is a builtin dictionary
    '''
    if dictionary.builtin:
        raise ValueError('Cannot remove builtin dictionaries')
    dictionary_dir = os.path.dirname(dictionary.dicpath)
    shutil.rmtree(dictionary_dir)
    # Only update the preferences once the files are really gone
    remaining = {}
    for locale, dictionary_id in iteritems(dprefs['preferred_dictionaries']):
        if dictionary_id != dictionary.id:
            remaining[locale] = dictionary_id
    dprefs['preferred_dictionaries'] = remaining


def rename_dictionary(dictionary, name):
    '''
    Change the name of ``dictionary`` to ``name`` by rewriting the first line
    of the ``locales`` file that sits next to its ``.dic`` file, then refresh
    the cache of custom dictionaries.
    '''
    locales_path = os.path.join(os.path.dirname(dictionary.dicpath), 'locales')
    with open(locales_path, 'r+b') as f:
        old_lines = f.read().splitlines()
        # The name is the first line, everything after it is kept as is
        new_lines = [name.encode('utf-8')] + old_lines[1:]
        f.seek(0)
        f.truncate()
        f.write(b'\n'.join(new_lines))
    custom_dictionaries(reread=True)


def get_dictionary(locale, exact_match=False):
    '''
    Find the :class:`Dictionary` to use for ``locale``.

    Dictionaries that cover ``locale`` exactly are tried first. If there are
    none and ``exact_match`` is False, the search is widened to dictionaries
    for the same language. Returns None when nothing suitable exists.
    '''
    preferred = preferred_dictionary(locale)

    # Collect, keyed by id, every dictionary that covers the locale exactly.
    # Within a collection a dictionary whose primary locale matches takes
    # precedence over one that merely lists the locale.
    exact_matches = {}
    for collection in (custom_dictionaries(), builtin_dictionaries()):
        exact_matches.update((d.id, d) for d in collection if d.primary_locale == locale)
        for d in collection:
            if d.id in exact_matches:
                continue
            if any(q == locale for q in d.locales):
                exact_matches[d.id] = d

    # If the user has specified a preferred dictionary for this locale, use it,
    # otherwise, if a builtin dictionary exists, use that (no preference is
    # None, which is also the id of the builtin dictionaries)
    if preferred in exact_matches:
        return exact_matches[preferred]

    def user_installed_first(dictionary_id):
        if dictionary_id is None:
            return (1, None)
        return (0, dictionary_id)

    # Return one of the exactly matching dictionaries, preferring user
    # installed to builtin ones
    if exact_matches:
        ordered_ids = sorted(exact_matches, key=user_installed_first)
        return exact_matches[ordered_ids[0]]

    if exact_match:
        return None

    # Nothing covers the locale exactly, so fall back to matching on the
    # language only. The preferred locale for the language gets the first try.
    best_locale = best_locale_for_language(locale.langcode)
    if best_locale is not None:
        ans = get_dictionary(best_locale, exact_match=True)
        if ans is not None:
            return ans

    # Finally accept any dictionary for the language, preferring user
    # installed ones to builtin ones
    for collection in (custom_dictionaries(), builtin_dictionaries()):
        by_name = sorted(collection, key=lambda d: d.name or '')
        same_language = [d for d in by_name if d.primary_locale.langcode == locale.langcode]
        if same_language:
            return same_language[0]
    return None


def load_dictionary(dictionary):
    '''
    Read the ``.dic`` and ``.aff`` files of ``dictionary``, pass their
    contents through ``convert_to_utf8`` and build a hunspell object from the
    result. Returns a :class:`LoadedDictionary` that carries over the locales,
    ``builtin`` flag, name and id of ``dictionary``.
    '''
    from calibre.spell.import_from import convert_to_utf8
    with open(dictionary.dicpath, 'rb') as dic:
        with open(dictionary.affpath, 'rb') as aff:
            raw_dic = dic.read()
            raw_aff = aff.read()
    dic_data, aff_data = convert_to_utf8(raw_dic, raw_aff)
    obj = hunspell.Dictionary(dic_data, aff_data)
    return LoadedDictionary(
        primary_locale=dictionary.primary_locale, locales=dictionary.locales, obj=obj,
        builtin=dictionary.builtin, name=dictionary.name, id=dictionary.id)


class Dictionaries(object):
    '''
    Spell checking front end that combines the hunspell dictionaries with the
    user dictionaries stored in ``dprefs`` and a set of ignored words.

    Loaded dictionaries are cached per locale in ``self.dictionaries`` and the
    outcome of checking a word is cached per ``(word, locale)`` in
    ``self.word_cache``. :meth:`initialize` must be called before any method
    that consults the user dictionaries, since they are only read from
    ``dprefs`` at that point.
    '''

    def __init__(self):
        # Runs of ASCII hyphens and U+2010 (HYPHEN), removed to de-hyphenate a word
        self.remove_hyphenation = re.compile('[\u2010-]+')
        # Words that start with a minus sign followed by a period, digit or
        # plus sign are always accepted, see recognized()
        self.negative_pat = re.compile(r'-[.\d+]')
        # Punctuation at which suggestions() tries to split a word in two
        self.fix_punctuation_pat = re.compile(r'''[:.]''')
        # locale -> LoadedDictionary, or None if no dictionary exists for it
        self.dictionaries = {}
        # (word, locale) -> bool
        self.word_cache = {}
        # Set of (word, langcode)
        self.ignored_words = set()
        self.added_user_words = {}
        try:
            self.default_locale = parse_lang_code(get_lang())
        except ValueError:
            self.default_locale = parse_lang_code('en-US')
        self.ui_locale = self.default_locale

    def initialize(self, force=False):
        ''' Read the user dictionaries, if that has not been done yet or ``force`` is True. '''
        if force or not hasattr(self, 'active_user_dictionaries'):
            self.read_user_dictionaries()

    def clear_caches(self):
        ''' Forget all loaded dictionaries and all cached word check results. '''
        self.dictionaries.clear()
        self.word_cache.clear()

    def clear_ignored(self):
        ''' Forget all ignored words. '''
        self.ignored_words.clear()

    def dictionary_for_locale(self, locale):
        '''
        Return the :class:`LoadedDictionary` for ``locale``, loading and
        caching it on first use, or None if no dictionary is available.

        The words of all active user dictionaries whose language code matches
        that of ``locale`` are added to a freshly loaded dictionary.
        '''
        ans = self.dictionaries.get(locale, not_present)
        if ans is not_present:
            ans = get_dictionary(locale)
            if ans is not None:
                ans = load_dictionary(ans)
                for ud in self.active_user_dictionaries:
                    for word, langcode in ud.words:
                        if langcode == locale.langcode:
                            try:
                                ans.obj.add(word)
                            except Exception:
                                # not critical since all it means is that the word wont show up in suggestions
                                prints('Failed to add the word %r to the dictionary for %s' % (word, locale), file=sys.stderr)
            # None is cached as well, so that the lookup is not repeated
            self.dictionaries[locale] = ans
        return ans

    def ignore_word(self, word, locale):
        ''' Treat ``word`` as correctly spelled in the language of ``locale``. '''
        self.ignored_words.add((word, locale.langcode))
        self.word_cache[(word, locale)] = True

    def unignore_word(self, word, locale):
        ''' Undo :meth:`ignore_word`. '''
        self.ignored_words.discard((word, locale.langcode))
        self.word_cache.pop((word, locale), None)

    def is_word_ignored(self, word, locale):
        ''' Return True if ``word`` is ignored in the language of ``locale``. '''
        return (word, locale.langcode) in self.ignored_words

    @property
    def all_user_dictionaries(self):
        ''' Iterator over the active user dictionaries followed by the inactive ones. '''
        return chain(self.active_user_dictionaries, self.inactive_user_dictionaries)

    def user_dictionary(self, name):
        ''' Return the user dictionary named ``name`` or None if there is none. '''
        for ud in self.all_user_dictionaries:
            if ud.name == name:
                return ud

    def read_user_dictionaries(self):
        '''
        (Re)create the lists of active and inactive user dictionaries from
        ``dprefs``, using the defaults if nothing is stored.
        '''
        self.active_user_dictionaries = []
        self.inactive_user_dictionaries = []
        for d in dprefs['user_dictionaries'] or dprefs.defaults['user_dictionaries']:
            d = UserDictionary(**d)
            (self.active_user_dictionaries if d.is_active else self.inactive_user_dictionaries).append(d)

    def mark_user_dictionary_as_active(self, name, is_active=True):
        '''
        Set the ``is_active`` flag of the user dictionary named ``name`` and
        save. Returns False if no such dictionary exists, True otherwise.
        '''
        d = self.user_dictionary(name)
        if d is not None:
            d.is_active = is_active
            self.save_user_dictionaries()
            return True
        return False

    def save_user_dictionaries(self):
        ''' Store the serialized form of all user dictionaries in ``dprefs``. '''
        dprefs['user_dictionaries'] = [d.serialize() for d in self.all_user_dictionaries]

    def add_user_words(self, words, langcode):
        '''
        Add ``words`` (an iterable of plain words) to every loaded dictionary
        whose primary locale has the language code ``langcode``.
        '''
        for d in itervalues(self.dictionaries):
            if d and getattr(d.primary_locale, 'langcode', None) == langcode:
                for word in words:
                    d.obj.add(word)

    def remove_user_words(self, words, langcode):
        '''
        Remove ``words`` (an iterable of plain words) from every loaded
        dictionary whose primary locale has the language code ``langcode``.
        '''
        for d in itervalues(self.dictionaries):
            if d and d.primary_locale.langcode == langcode:
                for word in words:
                    d.obj.remove(word)

    def add_to_user_dictionary(self, name, word, locale):
        '''
        Add to the user dictionary named ``name``.

        :param word: Either a single word, which is added for the language of
            ``locale``, or a set/frozenset of ``(word, langcode)`` tuples,
            which are added as they are.
        :return: True if the user dictionary grew, False otherwise
        :raises ValueError: If there is no user dictionary named ``name``
        '''
        ud = self.user_dictionary(name)
        if ud is None:
            raise ValueError('Cannot add to the dictionary named: %s as no such dictionary exists' % name)
        wl = len(ud.words)
        is_bulk = isinstance(word, (set, frozenset))
        if is_bulk:
            # The loaded hunspell dictionaries want plain words, not the
            # (word, langcode) tuples stored in the user dictionary, so group
            # the words by language before handing them over
            by_language = defaultdict(set)
            for w, langcode in word:
                by_language[langcode].add(w)
            ud.words |= word
            for langcode, plain_words in iteritems(by_language):
                self.add_user_words(plain_words, langcode)
        else:
            ud.words.add((word, locale.langcode))
            self.add_user_words((word,), locale.langcode)
        if len(ud.words) > wl:
            self.save_user_dictionaries()
            if is_bulk:
                # Drop the cached results for all the added words, whatever
                # the exact locale they were checked with
                stale = [key for key in self.word_cache if (key[0], key[1].langcode) in word]
                for key in stale:
                    del self.word_cache[key]
            else:
                self.word_cache.pop((word, locale), None)
            return True
        return False

    def remove_from_user_dictionaries(self, word, locale):
        '''
        Remove ``word``, for the language of ``locale``, from all active user
        dictionaries. Returns True if it was present in any of them.
        '''
        key = (word, locale.langcode)
        changed = False
        for ud in self.active_user_dictionaries:
            if key in ud.words:
                changed = True
                ud.words.discard(key)
        if changed:
            self.word_cache.pop((word, locale), None)
            self.save_user_dictionaries()
            self.remove_user_words((word,), locale.langcode)
        return changed

    def remove_from_user_dictionary(self, name, words):
        '''
        Remove ``words``, a collection of ``(word, locale)`` pairs, from the
        user dictionaries named ``name``. Returns True if anything was removed.
        '''
        changed = False
        removals = defaultdict(set)
        keys = [(w, l.langcode) for w, l in words]
        for d in self.all_user_dictionaries:
            if d.name == name:
                for key in keys:
                    if key in d.words:
                        d.words.discard(key)
                        removals[key[1]].add(key[0])
                        changed = True
        if changed:
            for key in words:
                self.word_cache.pop(key, None)
            for langcode, removed_words in iteritems(removals):
                self.remove_user_words(removed_words, langcode)
            self.save_user_dictionaries()
        return changed

    def word_in_user_dictionary(self, word, locale):
        '''
        Return the name of the first active user dictionary that contains
        ``word`` for the language of ``locale``, or None.
        '''
        key = (word, locale.langcode)
        for ud in self.active_user_dictionaries:
            if key in ud.words:
                return ud.name

    def create_user_dictionary(self, name):
        '''
        Create a new, empty and active user dictionary named ``name``.

        :raises ValueError: If a user dictionary with that name already exists
        '''
        if name in {d.name for d in self.all_user_dictionaries}:
            raise ValueError('A dictionary named %s already exists' % name)
        d = UserDictionary(name=name, is_active=True, words=())
        self.active_user_dictionaries.append(d)
        self.save_user_dictionaries()

    def remove_user_dictionary(self, name):
        '''
        Remove the user dictionaries named ``name``. If any were removed, save
        and clear all caches. Returns True if anything was removed.
        '''
        changed = False
        for x in (self.active_user_dictionaries, self.inactive_user_dictionaries):
            # Iterate over a copy as the list is modified in the loop
            for d in tuple(x):
                if d.name == name:
                    x.remove(d)
                    changed = True
        if changed:
            self.save_user_dictionaries()
            self.clear_caches()
        return changed

    def rename_user_dictionary(self, name, new_name):
        '''
        Rename the user dictionaries named ``name`` to ``new_name``. Returns
        True if anything was renamed.
        '''
        changed = False
        for d in self.all_user_dictionaries:
            if d.name == name:
                d.name = new_name
                changed = True
        if changed:
            self.save_user_dictionaries()
        return changed

    def recognized(self, word, locale=None):
        '''
        Return True if ``word`` is to be treated as correctly spelled in
        ``locale`` (the default locale if not specified).

        Ignored words and words in an active user dictionary are accepted, as
        is every word when no dictionary is available for the locale and every
        word that matches ``self.negative_pat``. The result is cached.
        '''
        locale = locale or self.default_locale
        key = (word, locale)
        ans = self.word_cache.get(key, None)
        if ans is None:
            lkey = (word, locale.langcode)
            ans = False
            if lkey in self.ignored_words:
                ans = True
            else:
                for ud in self.active_user_dictionaries:
                    if lkey in ud.words:
                        ans = True
                        break
                else:
                    d = self.dictionary_for_locale(locale)
                    if d is not None:
                        try:
                            ans = d.obj.recognized(word.replace('\u2010', '-'))
                        except ValueError:
                            pass
                    else:
                        ans = True
            if ans is False and self.negative_pat.match(word) is not None:
                ans = True
            self.word_cache[key] = ans
        return ans

    def suggestions(self, word, locale=None):
        '''
        Return a tuple of suggested replacements for ``word`` in ``locale``
        (the default locale if not specified). The tuple is empty if no
        dictionary is available.

        If removing the hyphens from ``word`` gives a recognized word, that
        word is made the first suggestion. Otherwise, if ``word`` can be split
        at a colon or period into two recognized words, the two words
        separated by the punctuation and a space are placed at the front.
        '''
        locale = locale or self.default_locale
        d = self.dictionary_for_locale(locale)
        has_unicode_hyphen = '\u2010' in word
        ans = ()

        def add_suggestion(w, ans):
            # Put w first, without duplicating it
            return (w,) + tuple(x for x in ans if x != w)

        if d is not None:
            try:
                ans = d.obj.suggest(unicode_type(word).replace('\u2010', '-'))
            except ValueError:
                pass
            else:
                dehyphenated_word = self.remove_hyphenation.sub('', word)
                if len(dehyphenated_word) != len(word) and self.recognized(dehyphenated_word, locale):
                    # Ensure the de-hyphenated word is present and is the first suggestion
                    ans = add_suggestion(dehyphenated_word, ans)
                else:
                    m = self.fix_punctuation_pat.search(word)
                    if m is not None:
                        w1, w2 = word[:m.start()], word[m.end():]
                        # Both halves must be checked in the locale being
                        # used, not in the default locale
                        if self.recognized(w1, locale) and self.recognized(w2, locale):
                            fw = w1 + m.group() + ' ' + w2
                            ans = add_suggestion(fw, ans)
                            if capitalize(w2) != w2:
                                fw = w1 + m.group() + ' ' + capitalize(w2)
                                ans = add_suggestion(fw, ans)

        if has_unicode_hyphen:
            # Hand back the same kind of hyphen that was passed in
            ans = tuple(w.replace('-', '\u2010') for w in ans)
        return ans


def find_tests():
    ''' Return a unittest suite that exercises the spell checking dictionaries. '''
    import unittest

    class TestDictionaries(unittest.TestCase):

        def setUp(self):
            checker = Dictionaries()
            checker.initialize()
            english = parse_lang_code('en')
            # All checks in these tests are made against English
            self.recognized = partial(checker.recognized, locale=english)
            self.suggestions = partial(checker.suggestions, locale=english)

        def ar(self, w):
            # Assert that the word w is recognized
            if self.recognized(w):
                return
            raise AssertionError('The word %r was not recognized' % w)

        def test_dictionaries(self):
            for word in 'recognized one-half one\u2010half'.split():
                self.ar(word)
            spanish = get_dictionary(parse_lang_code('es'))
            spanish_obj = load_dictionary(spanish).obj
            self.assertTrue(spanish_obj.recognized('Achí'))
            self.assertIn('one\u2010half', self.suggestions('oone\u2010half'))
            self.assertIn('adequately', self.suggestions('ade-quately'))
            self.assertIn('magic. Wand', self.suggestions('magic.wand'))

    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(TestDictionaries)