Full-text search: Also index text in ZIP and RAR archives. These files are viewable in the viewer, so full-text search should also index them, to avoid surprising behavior. Fixes #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)

This commit is contained in:
Kovid Goyal 2025-03-06 09:00:34 +05:30
parent 1fe527e351
commit 9bbff10bfc
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -2,11 +2,13 @@
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net> # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import contextlib
import os import os
import re import re
import unicodedata import unicodedata
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks.conversion.archives import ARCHIVE_FMTS, unarchive
from calibre.ebooks.oeb.base import XPNSMAP, barename from calibre.ebooks.oeb.base import XPNSMAP, barename
from calibre.ebooks.oeb.iterator.book import extract_book from calibre.ebooks.oeb.iterator.book import extract_book
from calibre.ebooks.oeb.polish.container import Container as ContainerBase from calibre.ebooks.oeb.polish.container import Container as ContainerBase
@ -55,8 +57,10 @@ def to_text(container, name):
def is_fmt_ok(input_fmt): def is_fmt_ok(input_fmt):
input_fmt = input_fmt.upper() input_fmt = input_fmt.upper()
input_plugin = plugin_for_input_format(input_fmt) input_plugin = plugin_for_input_format(input_fmt)
if not input_plugin:
return False
is_comic = bool(getattr(input_plugin, 'is_image_collection', False)) is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
if not input_plugin or is_comic: if is_comic:
return False return False
return input_plugin return input_plugin
@ -79,13 +83,22 @@ def extract_text(pathtoebook):
input_fmt = pathtoebook.rpartition('.')[-1].upper() input_fmt = pathtoebook.rpartition('.')[-1].upper()
ans = '' ans = ''
input_plugin = is_fmt_ok(input_fmt) input_plugin = is_fmt_ok(input_fmt)
if not input_plugin: with contextlib.ExitStack() as exit_stack:
return ans if not input_plugin:
input_plugin = plugin_for_input_format(input_fmt) if input_fmt.lower() in ARCHIVE_FMTS:
if input_fmt == 'PDF': try:
ans = pdftotext(pathtoebook) tdir = exit_stack.enter_context(TemporaryDirectory())
else: pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
with TemporaryDirectory() as tdir: input_fmt = input_fmt.upper()
except Exception:
return ans
else:
return ans
input_plugin = plugin_for_input_format(input_fmt)
if input_fmt == 'PDF':
ans = pdftotext(pathtoebook)
else:
tdir = exit_stack.enter_context(TemporaryDirectory())
texts = [] texts = []
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log) book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
input_plugin = plugin_for_input_format(input_fmt) input_plugin = plugin_for_input_format(input_fmt)
@ -96,7 +109,7 @@ def extract_text(pathtoebook):
for name, is_linear in container.spine_names: for name, is_linear in container.spine_names:
texts.extend(to_text(container, name)) texts.extend(to_text(container, name))
ans = '\n\n\n'.join(texts) ans = '\n\n\n'.join(texts)
return unicodedata.normalize('NFC', ans).replace('\u00ad', '') return unicodedata.normalize('NFC', ans).replace('\u00ad', '')
def main(pathtoebook): def main(pathtoebook):