Ensure text fed to the FTS engine is in NFKC form

2026-01-06 12:10:18 -05:00 · 2021-06-19 13:58:28 +05:30 · 2021-06-19 13:58:28 +05:30 · 6f7454f1ad
commit 6f7454f1ad
parent 52a87af143
3 changed files with 12 additions and 5 deletions
--- a/src/calibre/db/annotations.py
+++ b/src/calibre/db/annotations.py
@@ -11,6 +11,11 @@ from polyglot.builtins import itervalues
 no_cfi_sort_key = cfi_sort_key('/99999999')


+def unicode_normalize(text):
+    from unicodedata import normalize
+    return normalize('NFKC', text)
+
+
 def bookmark_sort_key(b):
    if b.get('pos_type') == 'epubcfi':
        return cfi_sort_key(b['pos'], only_path=False)
@@ -125,4 +130,4 @@ def annot_db_data(annot):
        notes = annot.get('notes') or ''
        if notes:
            text += '\n\x1f\n' + notes
-    return aid, text
+    return aid, unicode_normalize(text)
--- a/src/calibre/db/backend.py
+++ b/src/calibre/db/backend.py
@@ -19,7 +19,7 @@ from calibre.constants import (iswindows, filesystem_encoding,
        preferred_encoding)
 from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
 from calibre.db import SPOOL_SIZE
-from calibre.db.annotations import annot_db_data
+from calibre.db.annotations import annot_db_data, unicode_normalize
 from calibre.db.schema_upgrades import SchemaUpgrade
 from calibre.db.delete_service import delete_service
 from calibre.db.errors import NoSuchFormat
@@ -1801,6 +1801,7 @@ class DB(object):
        fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type,
        restrict_to_book_ids, restrict_to_user, ignore_removed=False
    ):
+        fts_engine_query = unicode_normalize(fts_engine_query)
        fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts'
        text = 'annotations.searchable_text'
        if highlight_start is not None and highlight_end is not None:
--- a/src/calibre/db/tests/fts.py
+++ b/src/calibre/db/tests/fts.py
@@ -9,6 +9,7 @@ from apsw import Connection

 from calibre.constants import plugins
 from calibre.db.tests.base import BaseTest
+from calibre.db.annotations import unicode_normalize


 def print(*args, **kwargs):
@@ -35,7 +36,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
        return self.cursor().execute(*a)

    def insert_text(self, text):
-        self.execute('INSERT INTO fts_table(t) VALUES (?)', (text,))
+        self.execute('INSERT INTO fts_table(t) VALUES (?)', (unicode_normalize(text),))

    def term_row_counts(self):
        return dict(self.execute('SELECT term,doc FROM fts_row'))
@@ -46,14 +47,14 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
            f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "…", {snippet_size})'
            ' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK'
        )
-        return list(self.execute(stmt, (query,)))
+        return list(self.execute(stmt, (unicode_normalize(query),)))


 def tokenize(text, flags=None, remove_diacritics=True):
    from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
    if flags is None:
        flags = FTS5_TOKENIZE_DOCUMENT
-    return tokenize(text, remove_diacritics, flags)
+    return tokenize(unicode_normalize(text), remove_diacritics, flags)


 class FTSTest(BaseTest):