From a35ff73ced0c6b1189710639f9cf4f962042d3f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jul 2022 07:33:39 +0530 Subject: [PATCH] Full text search: Ignore soft hyphens when extracting searchable text from books. Note that you will have to re-index your library to take advantage of this. --- src/calibre/db/fts/text.py | 27 ++++++++++++++------------- src/calibre/db/tests/fts_api.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py index faff0fd618..70458f9b35 100644 --- a/src/calibre/db/fts/text.py +++ b/src/calibre/db/fts/text.py @@ -83,19 +83,20 @@ def extract_text(pathtoebook): return ans input_plugin = plugin_for_input_format(input_fmt) if input_fmt == 'PDF': - return pdftotext(pathtoebook) - with TemporaryDirectory() as tdir: - texts = [] - book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log) - input_plugin = plugin_for_input_format(input_fmt) - is_comic = bool(getattr(input_plugin, 'is_image_collection', False)) - if is_comic: - return '' - container = SimpleContainer(tdir, opfpath, default_log) - for name, is_linear in container.spine_names: - texts.extend(to_text(container, name)) - ans = '\n\n\n'.join(texts) - return unicodedata.normalize('NFC', ans) + ans = pdftotext(pathtoebook) + else: + with TemporaryDirectory() as tdir: + texts = [] + book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log) + input_plugin = plugin_for_input_format(input_fmt) + is_comic = bool(getattr(input_plugin, 'is_image_collection', False)) + if is_comic: + return '' + container = SimpleContainer(tdir, opfpath, default_log) + for name, is_linear in container.spine_names: + texts.extend(to_text(container, name)) + ans = '\n\n\n'.join(texts) + return unicodedata.normalize('NFC', ans).replace('\u00ad', '') def main(pathtoebook): diff --git a/src/calibre/db/tests/fts_api.py b/src/calibre/db/tests/fts_api.py index 55cf6bde21..93de111f52 100644 --- a/src/calibre/db/tests/fts_api.py +++ b/src/calibre/db/tests/fts_api.py @@ -98,7 +98,7 @@ class FTSAPITest(BaseTest): # check enabling scans pre-exisintg cache = self.new_library() - cache.add_format(1, 'TXTZ', self.make_txtz(b'a test text')) + cache.add_format(1, 'TXTZ', self.make_txtz('a test te\u00adxt'.encode('utf-8'))) fts = cache.enable_fts() self.wait_for_fts_to_finish(fts) check(id=1, book=1, format='TXTZ', searchable_text='a test text')