Ensure text fed to the FTS engine is in NFKC form

This commit is contained in:
Kovid Goyal 2021-06-19 13:58:28 +05:30
parent 52a87af143
commit 6f7454f1ad
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 12 additions and 5 deletions

View File

@ -11,6 +11,11 @@ from polyglot.builtins import itervalues
no_cfi_sort_key = cfi_sort_key('/99999999')
def unicode_normalize(text):
    """Return *text* converted to Unicode normalization form NFKC."""
    # Imported at call time, as in the original, so importing this module
    # does not pull in unicodedata.
    import unicodedata
    normalized = unicodedata.normalize('NFKC', text)
    return normalized
def bookmark_sort_key(b):
if b.get('pos_type') == 'epubcfi':
return cfi_sort_key(b['pos'], only_path=False)
@ -125,4 +130,4 @@ def annot_db_data(annot):
notes = annot.get('notes') or ''
if notes:
text += '\n\x1f\n' + notes
return aid, text
return aid, unicode_normalize(text)

View File

@ -19,7 +19,7 @@ from calibre.constants import (iswindows, filesystem_encoding,
preferred_encoding)
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
from calibre.db import SPOOL_SIZE
from calibre.db.annotations import annot_db_data
from calibre.db.annotations import annot_db_data, unicode_normalize
from calibre.db.schema_upgrades import SchemaUpgrade
from calibre.db.delete_service import delete_service
from calibre.db.errors import NoSuchFormat
@ -1801,6 +1801,7 @@ class DB(object):
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type,
restrict_to_book_ids, restrict_to_user, ignore_removed=False
):
fts_engine_query = unicode_normalize(fts_engine_query)
fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts'
text = 'annotations.searchable_text'
if highlight_start is not None and highlight_end is not None:

View File

@ -9,6 +9,7 @@ from apsw import Connection
from calibre.constants import plugins
from calibre.db.tests.base import BaseTest
from calibre.db.annotations import unicode_normalize
def print(*args, **kwargs):
@ -35,7 +36,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
return self.cursor().execute(*a)
def insert_text(self, text):
    """Insert *text* as a new row of ``fts_table``.

    The text is passed through ``unicode_normalize`` first so that the
    indexed content is in NFKC form. It is inserted exactly once; the
    un-normalized text is not inserted alongside the normalized one.
    """
    self.execute('INSERT INTO fts_table(t) VALUES (?)', (unicode_normalize(text),))
def term_row_counts(self):
    """Return a dict mapping each ``term`` in ``fts_row`` to its ``doc`` value.

    NOTE(review): ``fts_row`` appears to be the fts5vocab "row" table over
    ``fts_table``, in which case ``doc`` is the number of rows containing
    the term -- confirm against the schema.
    """
    term_doc_pairs = self.execute('SELECT term,doc FROM fts_row')
    return dict(term_doc_pairs)
@ -46,14 +47,14 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "", {snippet_size})'
' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK'
)
return list(self.execute(stmt, (query,)))
return list(self.execute(stmt, (unicode_normalize(query),)))
def tokenize(text, flags=None, remove_diacritics=True):
    """Tokenize *text* with the native sqlite extension tokenizer.

    *text* is passed through ``unicode_normalize`` (NFKC) before it reaches
    the tokenizer. When *flags* is None, ``FTS5_TOKENIZE_DOCUMENT`` is used.
    *remove_diacritics* is forwarded to the native tokenizer unchanged.
    """
    # Aliased so the native function does not shadow this wrapper's own name.
    from calibre_extensions.sqlite_extension import (
        tokenize as native_tokenize, FTS5_TOKENIZE_DOCUMENT
    )
    if flags is None:
        flags = FTS5_TOKENIZE_DOCUMENT
    return native_tokenize(unicode_normalize(text), remove_diacritics, flags)
class FTSTest(BaseTest):