Full text search: Ignore soft hyphens when extracting searchable text from books. Note that you will have to re-index your library to take advantage of this.

This commit is contained in:
Kovid Goyal 2022-07-20 07:33:39 +05:30
parent 93024884de
commit a35ff73ced
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 15 additions and 14 deletions

View File

@ -83,19 +83,20 @@ def extract_text(pathtoebook):
return ans return ans
input_plugin = plugin_for_input_format(input_fmt) input_plugin = plugin_for_input_format(input_fmt)
if input_fmt == 'PDF': if input_fmt == 'PDF':
return pdftotext(pathtoebook) ans = pdftotext(pathtoebook)
with TemporaryDirectory() as tdir: else:
texts = [] with TemporaryDirectory() as tdir:
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log) texts = []
input_plugin = plugin_for_input_format(input_fmt) book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
is_comic = bool(getattr(input_plugin, 'is_image_collection', False)) input_plugin = plugin_for_input_format(input_fmt)
if is_comic: is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
return '' if is_comic:
container = SimpleContainer(tdir, opfpath, default_log) return ''
for name, is_linear in container.spine_names: container = SimpleContainer(tdir, opfpath, default_log)
texts.extend(to_text(container, name)) for name, is_linear in container.spine_names:
ans = '\n\n\n'.join(texts) texts.extend(to_text(container, name))
return unicodedata.normalize('NFC', ans) ans = '\n\n\n'.join(texts)
return unicodedata.normalize('NFC', ans).replace('\u00ad', '')
def main(pathtoebook): def main(pathtoebook):

View File

@ -98,7 +98,7 @@ class FTSAPITest(BaseTest):
# check enabling scans pre-existing # check enabling scans pre-existing
cache = self.new_library() cache = self.new_library()
cache.add_format(1, 'TXTZ', self.make_txtz(b'a test text')) cache.add_format(1, 'TXTZ', self.make_txtz('a test te\u00adxt'.encode('utf-8')))
fts = cache.enable_fts() fts = cache.enable_fts()
self.wait_for_fts_to_finish(fts) self.wait_for_fts_to_finish(fts)
check(id=1, book=1, format='TXTZ', searchable_text='a test text') check(id=1, book=1, format='TXTZ', searchable_text='a test text')