From 58bde2e3040ca04d7a4c15539daef0165ce6166c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 14 Feb 2022 21:22:03 +0530
Subject: [PATCH] Basic implementation of extracting searchable text from HTML

---
# Review notes (commentary only; not part of the commit):
# - Line breaks and indentation were collapsed in this copy and have been
#   restored from the diff markers and Python syntax. Statement nesting is
#   inferred; confirm against the upstream commit before applying.
# - tag_to_text(tag) is a recursive generator. It yields tag.text, then for
#   each child either only the child's tail (when child.tag is not a str, or
#   its lower-cased bare name is in skipped_tags) or the child's full text,
#   preceded by '\n\n' when the bare name is in BLOCK_TAG_NAMES. It ends by
#   yielding tag.tail.
# - html_to_text(root) sets body.tail = '' before walking each 'h:body'
#   match, so the trailing-tail yield in tag_to_text() emits nothing for the
#   body element itself. The joined text is stripped and every run of three
#   or more newlines is replaced by exactly two. One string is yielded per
#   matched body.
# - to_text(container, name) parses the named item via container.parsed()
#   and delegates to html_to_text().
# - BLOCK_TAG_NAMES is imported from calibre.ebooks.oeb.polish.utils; the
#   hunk that adds it there is later in this same patch.

 src/calibre/db/fts/text.py                | 74 +++++++++++++++++++
 src/calibre/db/tests/fts_api.py           | 13 ++++
 src/calibre/ebooks/oeb/polish/utils.py    |  7 ++
 .../gui2/tweak_book/editor/smarts/html.py |  8 +-
 4 files changed, 95 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/db/fts/text.py

diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py
new file mode 100644
index 0000000000..4c9ca75a71
--- /dev/null
+++ b/src/calibre/db/fts/text.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2022, Kovid Goyal
+
+
+import re
+import unicodedata
+
+from calibre.customize.ui import plugin_for_input_format
+from calibre.ebooks.oeb.base import XPNSMAP, barename
+from calibre.ebooks.oeb.iterator.book import extract_book
+from calibre.ebooks.oeb.polish.container import Container as ContainerBase
+from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.logging import default_log
+
+
+class SimpleContainer(ContainerBase):
+
+    tweak_mode = True
+
+
+skipped_tags = frozenset({'style', 'title', 'script', 'head', 'img', 'svg', 'math'})
+
+
+def tag_to_text(tag):
+    if tag.text:
+        yield tag.text
+    for child in tag:
+        q = barename(child.tag).lower() if isinstance(child.tag, str) else ''
+        if not q or q in skipped_tags:
+            if child.tail:
+                yield child.tail
+        else:
+            if q in BLOCK_TAG_NAMES:
+                yield '\n\n'
+            yield from tag_to_text(child)
+    if tag.tail:
+        yield tag.tail
+
+
+def html_to_text(root):
+    pat = re.compile(r'\n{3,}')
+    for body in root.xpath('h:body', namespaces=XPNSMAP):
+        body.tail = ''
+        yield pat.sub('\n\n', ''.join(tag_to_text(body)).strip())
+
+
+def to_text(container, name):
+    root = container.parsed(name)
+    yield from html_to_text(root)
+
+
+def extract_text(pathtoebook):
+    input_fmt = pathtoebook.rpartition('.')[-1].upper()
+    input_plugin = plugin_for_input_format(input_fmt)
+    ans = ''
+    if not input_plugin:
+        return ans
+    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
+    if is_comic:
+        return ans
+    with TemporaryDirectory() as tdir:
+        texts = []
+        book_fmt, opfpath, input_fmt = extract_book(pathtoebook, tdir, log=default_log)
+        input_plugin = plugin_for_input_format(input_fmt)
+        is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
+        if is_comic:
+            return ''
+        container = SimpleContainer(tdir, opfpath, default_log)
+        for name, is_linear in container.spine_names:
+            texts.extend(to_text(container, name))
+        ans = '\n\n\n'.join(texts)
+    return unicodedata.normalize('NFC', ans)
# Review notes (commentary only; not part of the commit):
# - extract_text() returns '' when no input plugin is found for the upper-cased
#   file extension, or when the plugin's is_image_collection attribute is
#   truthy. The same check is repeated after extract_book() because that call
#   returns its own input_fmt, which replaces the extension-derived one.
# - Otherwise the text of every spine item is collected, joined with
#   '\n\n\n', and NFC-normalized before being returned.
# - book_fmt and is_linear are bound but never read in this function.
# - Line structure was restored from a whitespace-collapsed copy. The nesting
#   of the last two statements relative to the `with` block, and the blank
#   context lines in the hunks below, are inferred (the latter from the hunk
#   line counts); confirm against the upstream commit.
diff --git a/src/calibre/db/tests/fts_api.py b/src/calibre/db/tests/fts_api.py
index b18b42150f..f9336aee10 100644
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@@ -7,6 +7,7 @@ import sys
 from io import BytesIO
 
 from calibre.db.tests.base import BaseTest
+from calibre.db.fts.text import html_to_text
 
 
 def print(*args, **kwargs):
@@ -50,6 +51,18 @@ class FTSAPITest(BaseTest):
         fts.add_text(2, 'ADDED', 'data2')
         self.ae(fts.all_currently_dirty(), [])
 
+    def test_fts_to_text(self):
+        from calibre.ebooks.oeb.polish.parsing import parse
+        html = '''
+
first_para

second_para

+

some italic text

+
nested

blocks

+ +''' + root = parse(html) + self.ae(tuple(html_to_text(root)), ('first_para\n\nsecond_para\n\nsome italic text\n\nnested\n\nblocks',)) + def find_tests(): import unittest diff --git a/src/calibre/ebooks/oeb/polish/utils.py b/src/calibre/ebooks/oeb/polish/utils.py index 0ec5ffc25b..d27626541d 100644 --- a/src/calibre/ebooks/oeb/polish/utils.py +++ b/src/calibre/ebooks/oeb/polish/utils.py @@ -10,6 +10,13 @@ from bisect import bisect from calibre import guess_type as _guess_type, replace_entities +BLOCK_TAG_NAMES = frozenset(( + 'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset', + 'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul', + 'ol', 'li', 'body', 'td', 'th')) + + def guess_type(x): return _guess_type(x)[0] or 'application/octet-stream' diff --git a/src/calibre/gui2/tweak_book/editor/smarts/html.py b/src/calibre/gui2/tweak_book/editor/smarts/html.py index f64c58b8d1..402b2f2ca4 100644 --- a/src/calibre/gui2/tweak_book/editor/smarts/html.py +++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py @@ -15,6 +15,7 @@ from qt.core import Qt, QTextCursor, QTextEdit from calibre import prepare_string_for_xml, xml_entity_to_unicode from calibre.ebooks.oeb.base import css_text from calibre.ebooks.oeb.polish.container import OEB_DOCS +from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES from calibre.gui2 import error_dialog from calibre.gui2.tweak_book import current_container, tprefs from calibre.gui2.tweak_book.editor.smarts import NullSmarts @@ -284,13 +285,6 @@ def ensure_not_within_tag_definition(cursor, forward=True): return False -BLOCK_TAG_NAMES = frozenset(( - 'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset', - 'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul', - 'ol', 'li', 'body', 'td', 'th')) - - def 
find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES): while True: tag = find_closest_containing_tag(block, offset)