Ensure text fed to the FTS engine is in NFKC form

This commit is contained in:
Kovid Goyal 2021-06-19 13:58:28 +05:30
parent 52a87af143
commit 6f7454f1ad
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 12 additions and 5 deletions

View File

@ -11,6 +11,11 @@ from polyglot.builtins import itervalues
no_cfi_sort_key = cfi_sort_key('/99999999') no_cfi_sort_key = cfi_sort_key('/99999999')
def unicode_normalize(text):
    """Return *text* converted to Unicode normalization form NFKC.

    Text that is indexed by, or used to query, the full-text-search engine
    is passed through this function so that both sides are compared in the
    same normalized form.

    :param text: The unicode string to normalize.
    :return: The NFKC-normalized string.
    """
    # Imported locally, as in the original, so that merely importing this
    # module does not pull in unicodedata.
    from unicodedata import normalize
    return normalize('NFKC', text)
def bookmark_sort_key(b): def bookmark_sort_key(b):
if b.get('pos_type') == 'epubcfi': if b.get('pos_type') == 'epubcfi':
return cfi_sort_key(b['pos'], only_path=False) return cfi_sort_key(b['pos'], only_path=False)
@ -125,4 +130,4 @@ def annot_db_data(annot):
notes = annot.get('notes') or '' notes = annot.get('notes') or ''
if notes: if notes:
text += '\n\x1f\n' + notes text += '\n\x1f\n' + notes
return aid, text return aid, unicode_normalize(text)

View File

@ -19,7 +19,7 @@ from calibre.constants import (iswindows, filesystem_encoding,
preferred_encoding) preferred_encoding)
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
from calibre.db import SPOOL_SIZE from calibre.db import SPOOL_SIZE
from calibre.db.annotations import annot_db_data from calibre.db.annotations import annot_db_data, unicode_normalize
from calibre.db.schema_upgrades import SchemaUpgrade from calibre.db.schema_upgrades import SchemaUpgrade
from calibre.db.delete_service import delete_service from calibre.db.delete_service import delete_service
from calibre.db.errors import NoSuchFormat from calibre.db.errors import NoSuchFormat
@ -1801,6 +1801,7 @@ class DB(object):
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type, fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type,
restrict_to_book_ids, restrict_to_user, ignore_removed=False restrict_to_book_ids, restrict_to_user, ignore_removed=False
): ):
fts_engine_query = unicode_normalize(fts_engine_query)
fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts' fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts'
text = 'annotations.searchable_text' text = 'annotations.searchable_text'
if highlight_start is not None and highlight_end is not None: if highlight_start is not None and highlight_end is not None:

View File

@ -9,6 +9,7 @@ from apsw import Connection
from calibre.constants import plugins from calibre.constants import plugins
from calibre.db.tests.base import BaseTest from calibre.db.tests.base import BaseTest
from calibre.db.annotations import unicode_normalize
def print(*args, **kwargs): def print(*args, **kwargs):
@ -35,7 +36,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
return self.cursor().execute(*a) return self.cursor().execute(*a)
def insert_text(self, text): def insert_text(self, text):
self.execute('INSERT INTO fts_table(t) VALUES (?)', (text,)) self.execute('INSERT INTO fts_table(t) VALUES (?)', (unicode_normalize(text),))
def term_row_counts(self): def term_row_counts(self):
return dict(self.execute('SELECT term,doc FROM fts_row')) return dict(self.execute('SELECT term,doc FROM fts_row'))
@ -46,14 +47,14 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "", {snippet_size})' f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "", {snippet_size})'
' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK' ' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK'
) )
return list(self.execute(stmt, (query,))) return list(self.execute(stmt, (unicode_normalize(query),)))
def tokenize(text, flags=None, remove_diacritics=True): def tokenize(text, flags=None, remove_diacritics=True):
from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
if flags is None: if flags is None:
flags = FTS5_TOKENIZE_DOCUMENT flags = FTS5_TOKENIZE_DOCUMENT
return tokenize(text, remove_diacritics, flags) return tokenize(unicode_normalize(text), remove_diacritics, flags)
class FTSTest(BaseTest): class FTSTest(BaseTest):