diff --git a/src/calibre/db/annotations.py b/src/calibre/db/annotations.py index 1168c80e71..fa695e3075 100644 --- a/src/calibre/db/annotations.py +++ b/src/calibre/db/annotations.py @@ -11,6 +11,11 @@ from polyglot.builtins import itervalues no_cfi_sort_key = cfi_sort_key('/99999999') +def unicode_normalize(text): + from unicodedata import normalize + return normalize('NFKC', text) + + def bookmark_sort_key(b): if b.get('pos_type') == 'epubcfi': return cfi_sort_key(b['pos'], only_path=False) @@ -125,4 +130,4 @@ def annot_db_data(annot): notes = annot.get('notes') or '' if notes: text += '\n\x1f\n' + notes - return aid, text + return aid, unicode_normalize(text) diff --git a/src/calibre/db/backend.py b/src/calibre/db/backend.py index 6527b6e817..a108b6d173 100644 --- a/src/calibre/db/backend.py +++ b/src/calibre/db/backend.py @@ -19,7 +19,7 @@ from calibre.constants import (iswindows, filesystem_encoding, preferred_encoding) from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile from calibre.db import SPOOL_SIZE -from calibre.db.annotations import annot_db_data +from calibre.db.annotations import annot_db_data, unicode_normalize from calibre.db.schema_upgrades import SchemaUpgrade from calibre.db.delete_service import delete_service from calibre.db.errors import NoSuchFormat @@ -1801,6 +1801,7 @@ class DB(object): fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type, restrict_to_book_ids, restrict_to_user, ignore_removed=False ): + fts_engine_query = unicode_normalize(fts_engine_query) fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts' text = 'annotations.searchable_text' if highlight_start is not None and highlight_end is not None: diff --git a/src/calibre/db/tests/fts.py b/src/calibre/db/tests/fts.py index e1874444c5..8f49afe96b 100644 --- a/src/calibre/db/tests/fts.py +++ b/src/calibre/db/tests/fts.py @@ -9,6 +9,7 @@ from apsw import Connection from calibre.constants import plugins from calibre.db.tests.base import BaseTest +from calibre.db.annotations import unicode_normalize def print(*args, **kwargs): @@ -35,7 +36,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row); return self.cursor().execute(*a) def insert_text(self, text): - self.execute('INSERT INTO fts_table(t) VALUES (?)', (text,)) + self.execute('INSERT INTO fts_table(t) VALUES (?)', (unicode_normalize(text),)) def term_row_counts(self): return dict(self.execute('SELECT term,doc FROM fts_row')) @@ -46,14 +47,14 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row); f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "…", {snippet_size})' ' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK' ) - return list(self.execute(stmt, (query,))) + return list(self.execute(stmt, (unicode_normalize(query),))) def tokenize(text, flags=None, remove_diacritics=True): from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT if flags is None: flags = FTS5_TOKENIZE_DOCUMENT - return tokenize(text, remove_diacritics, flags) + return tokenize(unicode_normalize(text), remove_diacritics, flags) class FTSTest(BaseTest):