mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Ensure text fed to the FTS engine is in NFKC form
This commit is contained in:
parent
52a87af143
commit
6f7454f1ad
@ -11,6 +11,11 @@ from polyglot.builtins import itervalues
|
|||||||
no_cfi_sort_key = cfi_sort_key('/99999999')
|
no_cfi_sort_key = cfi_sort_key('/99999999')
|
||||||
|
|
||||||
|
|
||||||
|
def unicode_normalize(text):
|
||||||
|
from unicodedata import normalize
|
||||||
|
return normalize('NFKC', text)
|
||||||
|
|
||||||
|
|
||||||
def bookmark_sort_key(b):
|
def bookmark_sort_key(b):
|
||||||
if b.get('pos_type') == 'epubcfi':
|
if b.get('pos_type') == 'epubcfi':
|
||||||
return cfi_sort_key(b['pos'], only_path=False)
|
return cfi_sort_key(b['pos'], only_path=False)
|
||||||
@ -125,4 +130,4 @@ def annot_db_data(annot):
|
|||||||
notes = annot.get('notes') or ''
|
notes = annot.get('notes') or ''
|
||||||
if notes:
|
if notes:
|
||||||
text += '\n\x1f\n' + notes
|
text += '\n\x1f\n' + notes
|
||||||
return aid, text
|
return aid, unicode_normalize(text)
|
||||||
|
@ -19,7 +19,7 @@ from calibre.constants import (iswindows, filesystem_encoding,
|
|||||||
preferred_encoding)
|
preferred_encoding)
|
||||||
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
|
||||||
from calibre.db import SPOOL_SIZE
|
from calibre.db import SPOOL_SIZE
|
||||||
from calibre.db.annotations import annot_db_data
|
from calibre.db.annotations import annot_db_data, unicode_normalize
|
||||||
from calibre.db.schema_upgrades import SchemaUpgrade
|
from calibre.db.schema_upgrades import SchemaUpgrade
|
||||||
from calibre.db.delete_service import delete_service
|
from calibre.db.delete_service import delete_service
|
||||||
from calibre.db.errors import NoSuchFormat
|
from calibre.db.errors import NoSuchFormat
|
||||||
@ -1801,6 +1801,7 @@ class DB(object):
|
|||||||
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type,
|
fts_engine_query, use_stemming, highlight_start, highlight_end, snippet_size, annotation_type,
|
||||||
restrict_to_book_ids, restrict_to_user, ignore_removed=False
|
restrict_to_book_ids, restrict_to_user, ignore_removed=False
|
||||||
):
|
):
|
||||||
|
fts_engine_query = unicode_normalize(fts_engine_query)
|
||||||
fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts'
|
fts_table = 'annotations_fts_stemmed' if use_stemming else 'annotations_fts'
|
||||||
text = 'annotations.searchable_text'
|
text = 'annotations.searchable_text'
|
||||||
if highlight_start is not None and highlight_end is not None:
|
if highlight_start is not None and highlight_end is not None:
|
||||||
|
@ -9,6 +9,7 @@ from apsw import Connection
|
|||||||
|
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
from calibre.db.tests.base import BaseTest
|
from calibre.db.tests.base import BaseTest
|
||||||
|
from calibre.db.annotations import unicode_normalize
|
||||||
|
|
||||||
|
|
||||||
def print(*args, **kwargs):
|
def print(*args, **kwargs):
|
||||||
@ -35,7 +36,7 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
|||||||
return self.cursor().execute(*a)
|
return self.cursor().execute(*a)
|
||||||
|
|
||||||
def insert_text(self, text):
|
def insert_text(self, text):
|
||||||
self.execute('INSERT INTO fts_table(t) VALUES (?)', (text,))
|
self.execute('INSERT INTO fts_table(t) VALUES (?)', (unicode_normalize(text),))
|
||||||
|
|
||||||
def term_row_counts(self):
|
def term_row_counts(self):
|
||||||
return dict(self.execute('SELECT term,doc FROM fts_row'))
|
return dict(self.execute('SELECT term,doc FROM fts_row'))
|
||||||
@ -46,14 +47,14 @@ CREATE VIRTUAL TABLE fts_row USING fts5vocab(fts_table, row);
|
|||||||
f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "…", {snippet_size})'
|
f'SELECT snippet(fts_table, 0, "{highlight_start}", "{highlight_end}", "…", {snippet_size})'
|
||||||
' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK'
|
' FROM fts_table WHERE fts_table MATCH ? ORDER BY RANK'
|
||||||
)
|
)
|
||||||
return list(self.execute(stmt, (query,)))
|
return list(self.execute(stmt, (unicode_normalize(query),)))
|
||||||
|
|
||||||
|
|
||||||
def tokenize(text, flags=None, remove_diacritics=True):
|
def tokenize(text, flags=None, remove_diacritics=True):
|
||||||
from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
|
from calibre_extensions.sqlite_extension import tokenize, FTS5_TOKENIZE_DOCUMENT
|
||||||
if flags is None:
|
if flags is None:
|
||||||
flags = FTS5_TOKENIZE_DOCUMENT
|
flags = FTS5_TOKENIZE_DOCUMENT
|
||||||
return tokenize(text, remove_diacritics, flags)
|
return tokenize(unicode_normalize(text), remove_diacritics, flags)
|
||||||
|
|
||||||
|
|
||||||
class FTSTest(BaseTest):
|
class FTSTest(BaseTest):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user