mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Full text search: Ignore soft hyphens when extracting searchable text from books. Note that you will have to re-index your library to take advantage of this.
This commit is contained in:
parent
93024884de
commit
a35ff73ced
@@ -83,19 +83,20 @@ def extract_text(pathtoebook):
|
|||||||
return ans
|
return ans
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
if input_fmt == 'PDF':
|
if input_fmt == 'PDF':
|
||||||
return pdftotext(pathtoebook)
|
ans = pdftotext(pathtoebook)
|
||||||
with TemporaryDirectory() as tdir:
|
else:
|
||||||
texts = []
|
with TemporaryDirectory() as tdir:
|
||||||
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
texts = []
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
||||||
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
if is_comic:
|
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
|
||||||
return ''
|
if is_comic:
|
||||||
container = SimpleContainer(tdir, opfpath, default_log)
|
return ''
|
||||||
for name, is_linear in container.spine_names:
|
container = SimpleContainer(tdir, opfpath, default_log)
|
||||||
texts.extend(to_text(container, name))
|
for name, is_linear in container.spine_names:
|
||||||
ans = '\n\n\n'.join(texts)
|
texts.extend(to_text(container, name))
|
||||||
return unicodedata.normalize('NFC', ans)
|
ans = '\n\n\n'.join(texts)
|
||||||
|
return unicodedata.normalize('NFC', ans).replace('\u00ad', '')
|
||||||
|
|
||||||
|
|
||||||
def main(pathtoebook):
|
def main(pathtoebook):
|
||||||
|
@@ -98,7 +98,7 @@ class FTSAPITest(BaseTest):
|
|||||||
|
|
||||||
# check enabling scans pre-existing
|
# check enabling scans pre-existing
|
||||||
cache = self.new_library()
|
cache = self.new_library()
|
||||||
cache.add_format(1, 'TXTZ', self.make_txtz(b'a test text'))
|
cache.add_format(1, 'TXTZ', self.make_txtz('a test te\u00adxt'.encode('utf-8')))
|
||||||
fts = cache.enable_fts()
|
fts = cache.enable_fts()
|
||||||
self.wait_for_fts_to_finish(fts)
|
self.wait_for_fts_to_finish(fts)
|
||||||
check(id=1, book=1, format='TXTZ', searchable_text='a test text')
|
check(id=1, book=1, format='TXTZ', searchable_text='a test text')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user