mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Ensure text fed to the FTS engine is in NFKC form
This commit is contained in:
parent
52a87af143
commit
6f7454f1ad
@ -11,6 +11,11 @@ from polyglot.builtins import itervalues
|
||||
no_cfi_sort_key = cfi_sort_key('/99999999')
|
||||
|
||||
|
||||
def unicode_normalize(text):
    """Return *text* converted to Unicode normalization form NFKC.

    Used so that text handed to the FTS engine (both the indexed text and
    the queries run against it) is in one consistent normalized form.
    """
    # Imported lazily, inside the function, exactly as the original did.
    from unicodedata import normalize

    return normalize('NFKC', text)
|
||||
|
||||
|
||||
def bookmark_sort_key(b):
|
||||
if b.get('pos_type') == 'epubcfi':
|
||||
return cfi_sort_key(b['pos'], only_path=False)
|
||||
@ -125,4 +130,4 @@ def annot_db_data(annot):
|
||||
notes = annot.get('notes') or ''
|
||||
if notes:
|
||||
text += '\n\x1f\n' + notes
|
||||
return aid, text
|
||||
return aid, unicode_normalize(text)
|
||||
|
@ -19,7 +19,7 @@ from calibre.constants import (iswindows, filesystem_encoding,
|
||||
preferred_encoding)
|
||||
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
|
||||
from calibre.db import SPOOL_SIZE
|
||||
from calibre.db.annotations import annot_db_data
|
||||
from calibre.db.annotations import annot_db_data, unicode_normalize
|
||||
from calibre.db.schema_upgrades import SchemaUpgrade
|
||||
from calibre.db.delete_service import delete_service
|
||||
from calibre.db.errors import NoSuchFormat
|
||||
@ -1801,6 +1801,7 @@ class DB(object):
|
||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type,
|
||||
restrict_to_book_ids, restrict_to_user, ignore_removed=False
|
||||
):
|
||||
fts_engine_query = unicode_normalize(fts_engine_query)
|
||||
fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts'
|
||||
text = 'annotations.searchable_text'
|
||||
if highlight_start is not None and highlight_end is not None:
|
||||
|
@ -9,6 +9,7 @@ from apsw import Connection
|
||||
|
||||
from calibre.constants import plugins
|
||||
from calibre.db.tests.base import BaseTest
|
||||
from calibre.db.annotations import unicode_normalize
|
||||
|
||||
|
||||
def print(*args, **kwargs):
|
||||
@ -35,7 +36,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
||||
return self.cursor().execute(*a)
|
||||
|
||||
def insert_text(self, text):
    """Insert *text* as a new row of ``fts_table``.

    The text is passed through ``unicode_normalize`` first so that what is
    stored matches the normalization applied to queries. Only this single,
    normalized INSERT is issued; inserting the raw text as well would
    create a duplicate row for every call.
    """
    normalized = unicode_normalize(text)
    self.execute('INSERT INTO fts_table(t) VALUES (?)', (normalized,))
|
||||
|
||||
def term_row_counts(self):
    """Return a dict mapping each ``term`` to its ``doc`` value.

    The rows are read from ``fts_row``, the fts5vocab ``row`` table created
    over ``fts_table``.
    """
    rows = self.execute('SELECT term,doc FROM fts_row')
    return dict(rows)
|
||||
@ -46,14 +47,14 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
||||
f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "…", {snippet_size})'
|
||||
' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK'
|
||||
)
|
||||
return list(self.execute(stmt, (query,)))
|
||||
return list(self.execute(stmt, (unicode_normalize(query),)))
|
||||
|
||||
|
||||
def tokenize(text, flags=None, remove_diacritics=True):
    """Tokenize *text* with the sqlite extension's tokenizer.

    :param text: The text to tokenize; it is run through
        ``unicode_normalize`` before being handed to the tokenizer.
    :param flags: Tokenizer flags; when ``None``, ``FTS5_TOKENIZE_DOCUMENT``
        is used.
    :param remove_diacritics: Passed through unchanged to the tokenizer.
    :return: Whatever the extension's ``tokenize`` returns.
    """
    # Aliased so the extension function does not shadow this wrapper's name.
    from calibre_extensions.sqlite_extension import (
        FTS5_TOKENIZE_DOCUMENT, tokenize as fts_tokenize
    )

    if flags is None:
        flags = FTS5_TOKENIZE_DOCUMENT
    # Single return path: always tokenize the normalized text. An earlier
    # un-normalized return would make this one unreachable.
    return fts_tokenize(unicode_normalize(text), remove_diacritics, flags)
|
||||
|
||||
|
||||
class FTSTest(BaseTest):
|
||||
|
Loading…
x
Reference in New Issue
Block a user