From 9bbff10bfc212fac3dd412de6df225555e3e2f1f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 6 Mar 2025 09:00:34 +0530 Subject: [PATCH] Full text search: Also index text in ZIP and RAR archives. These files are viewable in the viewer, so full text search should also index them, to avoid surprising behavior. Fixes #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891) --- src/calibre/db/fts/text.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py index 432d471704..9f009b7a43 100644 --- a/src/calibre/db/fts/text.py +++ b/src/calibre/db/fts/text.py @@ -2,11 +2,13 @@ # License: GPL v3 Copyright: 2022, Kovid Goyal +import contextlib import os import re import unicodedata from calibre.customize.ui import plugin_for_input_format +from calibre.ebooks.conversion.archives import ARCHIVE_FMTS, unarchive from calibre.ebooks.oeb.base import XPNSMAP, barename from calibre.ebooks.oeb.iterator.book import extract_book from calibre.ebooks.oeb.polish.container import Container as ContainerBase @@ -55,8 +57,10 @@ def to_text(container, name): def is_fmt_ok(input_fmt): input_fmt = input_fmt.upper() input_plugin = plugin_for_input_format(input_fmt) + if not input_plugin: + return False is_comic = bool(getattr(input_plugin, 'is_image_collection', False)) - if not input_plugin or is_comic: + if is_comic: return False return input_plugin @@ -79,13 +83,22 @@ def extract_text(pathtoebook): input_fmt = pathtoebook.rpartition('.')[-1].upper() ans = '' input_plugin = is_fmt_ok(input_fmt) - if not input_plugin: - return ans - input_plugin = plugin_for_input_format(input_fmt) - if input_fmt == 'PDF': - ans = pdftotext(pathtoebook) - else: - with TemporaryDirectory() as tdir: + with contextlib.ExitStack() as exit_stack: + if not input_plugin: + if input_fmt.lower() in ARCHIVE_FMTS: + try: + tdir = exit_stack.enter_context(TemporaryDirectory()) + pathtoebook, input_fmt = unarchive(pathtoebook, tdir) + input_fmt = input_fmt.upper() + except Exception: + return ans + else: + return ans + input_plugin = plugin_for_input_format(input_fmt) + if input_fmt == 'PDF': + ans = pdftotext(pathtoebook) + else: + tdir = exit_stack.enter_context(TemporaryDirectory()) texts = [] book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log) input_plugin = plugin_for_input_format(input_fmt) @@ -96,7 +109,7 @@ def extract_text(pathtoebook): for name, is_linear in container.spine_names: texts.extend(to_text(container, name)) ans = '\n\n\n'.join(texts) - return unicodedata.normalize('NFC', ans).replace('\u00ad', '') + return unicodedata.normalize('NFC', ans).replace('\u00ad', '') def main(pathtoebook):