diff --git a/src/calibre/gui2/tweak_book/char_select.py b/src/calibre/gui2/tweak_book/char_select.py
index b4c1eeeab9..97b2cb46f1 100644
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@@ -6,10 +6,9 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal '
 
-import unicodedata, re, os, cPickle, textwrap
+import re, textwrap
 from bisect import bisect
 from functools import partial
-from collections import defaultdict
 
 from PyQt5.Qt import (
     QAbstractItemModel, QModelIndex, Qt, pyqtSignal, QApplication,
@@ -17,12 +16,12 @@ from PyQt5.Qt import (
     QStyledItemDelegate, QSplitter, QLabel, QSizePolicy, QIcon, QMimeData,
     QPushButton, QToolButton, QInputMethodEvent)
 
-from calibre.constants import plugins, cache_dir
+from calibre.constants import plugins
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
 from calibre.gui2.tweak_book.widgets import Dialog, BusyCursor
-from calibre.utils.icu import safe_chr as chr, icu_unicode_version
-from calibre.utils.unicode_names import character_name_from_code
+from calibre.utils.icu import safe_chr as chr
+from calibre.utils.unicode_names import character_name_from_code, points_for_word
 
 ROOT = QModelIndex()
 
@@ -34,44 +33,9 @@ non_printing = {
     0x206e: 'nads', 0x206f: 'nods', 0x20: 'sp', 0x7f: 'del', 0x2e3a: '2m', 0x2e3b: '3m', 0xad: 'shy',
 }
 
+
 # Searching {{{
-
-
-def load_search_index():
-    topchar = 0x10ffff
-    ver = (1, topchar, icu_unicode_version or unicodedata.unidata_version)  # Increment this when you make any changes to the index
-    name_map = {}
-    path = os.path.join(cache_dir(), 'unicode-name-index.pickle')
-    if os.path.exists(path):
-        with open(path, 'rb') as f:
-            name_map = cPickle.load(f)
-        if name_map.pop('calibre-nm-version:', None) != ver:
-            name_map = {}
-    if not name_map:
-        name_map = defaultdict(set)
-        for x in xrange(1, topchar + 1):
-            for word in character_name_from_code(x).split():
-                name_map[word.lower()].add(x)
-        from calibre.ebooks.html_entities import html5_entities
-        for name, char in html5_entities.iteritems():
-            try:
-                name_map[name.lower()].add(ord(char))
-            except TypeError:
-                continue
-        name_map['nnbsp'].add(0x202F)
-        name_map['calibre-nm-version:'] = ver
-        cPickle.dump(dict(name_map), open(path, 'wb'), -1)
-    del name_map['calibre-nm-version:']
-    return name_map
-
-
-_index = None
-
-
 def search_for_chars(query, and_tokens=False):
-    global _index
-    if _index is None:
-        _index = load_search_index()
     ans = set()
     for token in query.split():
         token = token.lower()
@@ -79,7 +43,7 @@ def search_for_chars(query, and_tokens=False):
         if m is not None:
             chars = {int(m.group(1), 16)}
         else:
-            chars = _index.get(token, None)
+            chars = points_for_word(token)
         if chars is not None:
             if and_tokens:
                 ans &= chars
diff --git a/src/calibre/utils/unicode_names.py b/src/calibre/utils/unicode_names.py
index ea9d44ce27..7b5633146f 100644
--- a/src/calibre/utils/unicode_names.py
+++ b/src/calibre/utils/unicode_names.py
@@ -2,11 +2,39 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2018, Kovid Goyal
 
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from collections import defaultdict
 
 from calibre.constants import plugins
+from calibre.utils.icu import ord_string
 
 
 def character_name_from_code(code):
     return plugins['unicode_names'][0].name_for_codepoint(code) or 'U+{:X}'.format(code)
+
+
+def html_entities():
+    ans = getattr(html_entities, 'ans', None)
+    if ans is None:
+        from calibre.ebooks.html_entities import html5_entities
+        ans = defaultdict(set)
+        for name, char in html5_entities.iteritems():
+            try:
+                ans[name.lower()].add(ord_string(char)[0])
+            except TypeError:
+                continue
+        ans['nnbsp'].add(0x202F)
+        ans = dict(ans)
+        html_entities.ans = ans
+    return ans
+
+
+def points_for_word(w):
+    w = w.lower()
+    ans = points_for_word.cache.get(w)
+    if ans is None:
+        ans = plugins['unicode_names'][0].codepoints_for_word(w.encode('utf-8')) | html_entities().get(w, set())
+        points_for_word.cache[w] = ans
+    return ans
+points_for_word.cache = {}  # noqa