mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Full text search: Also index text in ZIP and RAR archives. These files are viewable in the viewer, so full text search should also index them, to avoid surprising behavior. Fixes #2100891 [[Enhancement] Full-text search for doc/docx/zip (containing htm/html)](https://bugs.launchpad.net/calibre/+bug/2100891)
This commit is contained in:
parent
1fe527e351
commit
9bbff10bfc
@ -2,11 +2,13 @@
|
|||||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
|
import contextlib
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
|
from calibre.ebooks.conversion.archives import ARCHIVE_FMTS, unarchive
|
||||||
from calibre.ebooks.oeb.base import XPNSMAP, barename
|
from calibre.ebooks.oeb.base import XPNSMAP, barename
|
||||||
from calibre.ebooks.oeb.iterator.book import extract_book
|
from calibre.ebooks.oeb.iterator.book import extract_book
|
||||||
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
||||||
@ -55,8 +57,10 @@ def to_text(container, name):
|
|||||||
def is_fmt_ok(input_fmt):
|
def is_fmt_ok(input_fmt):
|
||||||
input_fmt = input_fmt.upper()
|
input_fmt = input_fmt.upper()
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
|
if not input_plugin:
|
||||||
|
return False
|
||||||
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
|
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
|
||||||
if not input_plugin or is_comic:
|
if is_comic:
|
||||||
return False
|
return False
|
||||||
return input_plugin
|
return input_plugin
|
||||||
|
|
||||||
@ -79,13 +83,22 @@ def extract_text(pathtoebook):
|
|||||||
input_fmt = pathtoebook.rpartition('.')[-1].upper()
|
input_fmt = pathtoebook.rpartition('.')[-1].upper()
|
||||||
ans = ''
|
ans = ''
|
||||||
input_plugin = is_fmt_ok(input_fmt)
|
input_plugin = is_fmt_ok(input_fmt)
|
||||||
if not input_plugin:
|
with contextlib.ExitStack() as exit_stack:
|
||||||
return ans
|
if not input_plugin:
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
if input_fmt.lower() in ARCHIVE_FMTS:
|
||||||
if input_fmt == 'PDF':
|
try:
|
||||||
ans = pdftotext(pathtoebook)
|
tdir = exit_stack.enter_context(TemporaryDirectory())
|
||||||
else:
|
pathtoebook, input_fmt = unarchive(pathtoebook, tdir)
|
||||||
with TemporaryDirectory() as tdir:
|
input_fmt = input_fmt.upper()
|
||||||
|
except Exception:
|
||||||
|
return ans
|
||||||
|
else:
|
||||||
|
return ans
|
||||||
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
|
if input_fmt == 'PDF':
|
||||||
|
ans = pdftotext(pathtoebook)
|
||||||
|
else:
|
||||||
|
tdir = exit_stack.enter_context(TemporaryDirectory())
|
||||||
texts = []
|
texts = []
|
||||||
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
@ -96,7 +109,7 @@ def extract_text(pathtoebook):
|
|||||||
for name, is_linear in container.spine_names:
|
for name, is_linear in container.spine_names:
|
||||||
texts.extend(to_text(container, name))
|
texts.extend(to_text(container, name))
|
||||||
ans = '\n\n\n'.join(texts)
|
ans = '\n\n\n'.join(texts)
|
||||||
return unicodedata.normalize('NFC', ans).replace('\u00ad', '')
|
return unicodedata.normalize('NFC', ans).replace('\u00ad', '')
|
||||||
|
|
||||||
|
|
||||||
def main(pathtoebook):
|
def main(pathtoebook):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user