From a35ff73ced0c6b1189710639f9cf4f962042d3f6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 20 Jul 2022 07:33:39 +0530
Subject: [PATCH] Full text search: Ignore soft hyphens when extracting
 searchable text from books. Note that you will have to re-index your library
 to take advantage of this.

---
 src/calibre/db/fts/text.py      | 27 ++++++++++++++-------------
 src/calibre/db/tests/fts_api.py |  2 +-
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py
index faff0fd618..70458f9b35 100644
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@@ -83,19 +83,20 @@ def extract_text(pathtoebook):
         return ans
     input_plugin = plugin_for_input_format(input_fmt)
     if input_fmt == 'PDF':
-        return pdftotext(pathtoebook)
-    with TemporaryDirectory() as tdir:
-        texts = []
-        book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
-        input_plugin = plugin_for_input_format(input_fmt)
-        is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
-        if is_comic:
-            return ''
-        container = SimpleContainer(tdir, opfpath, default_log)
-        for name, is_linear in container.spine_names:
-            texts.extend(to_text(container, name))
-        ans = '\n\n\n'.join(texts)
-    return unicodedata.normalize('NFC', ans)
+        ans = pdftotext(pathtoebook)
+    else:
+        with TemporaryDirectory() as tdir:
+            texts = []
+            book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
+            input_plugin = plugin_for_input_format(input_fmt)
+            is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
+            if is_comic:
+                return ''
+            container = SimpleContainer(tdir, opfpath, default_log)
+            for name, is_linear in container.spine_names:
+                texts.extend(to_text(container, name))
+            ans = '\n\n\n'.join(texts)
+    return unicodedata.normalize('NFC', ans).replace('\u00ad', '')
 
 
 def main(pathtoebook):
diff --git a/src/calibre/db/tests/fts_api.py b/src/calibre/db/tests/fts_api.py
index 55cf6bde21..93de111f52 100644
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@@ -98,7 +98,7 @@ class FTSAPITest(BaseTest):
 
         # check enabling scans pre-exisintg
         cache = self.new_library()
-        cache.add_format(1, 'TXTZ', self.make_txtz(b'a test text'))
+        cache.add_format(1, 'TXTZ', self.make_txtz('a test te\u00adxt'.encode('utf-8')))
         fts = cache.enable_fts()
         self.wait_for_fts_to_finish(fts)
         check(id=1, book=1, format='TXTZ', searchable_text='a test text')