Basic implementation of extracting searchable text from HTML

2025-07-07 10:14:46 -04:00 · 2022-02-14 21:22:03 +05:30 · 2022-02-14 21:22:03 +05:30 · 58bde2e304
commit 58bde2e304
parent b66c72cc15
4 changed files with 95 additions and 7 deletions
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@ -0,0 +1,74 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
 import re
 import unicodedata
 from calibre.customize.ui import plugin_for_input_format
 from calibre.ebooks.oeb.base import XPNSMAP, barename
 from calibre.ebooks.oeb.iterator.book import extract_book
 from calibre.ebooks.oeb.polish.container import Container as ContainerBase
 from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.logging import default_log
 class SimpleContainer(ContainerBase):
    """Container used by extract_text() to walk the spine of an extracted book and parse its files."""
    # NOTE(review): presumably switches the base Container into the mode used
    # for already extracted books -- confirm against ContainerBase.
    tweak_mode = True
 # Lower-cased tag names whose contents tag_to_text() leaves out of the
 # extracted text; only the text following such an element (its tail) is kept.
 skipped_tags = frozenset({'style', 'title', 'script', 'head', 'img', 'svg', 'math'})
 def _tag_contents(tag):
    # Yield the text found inside tag (its own text plus that of its
    # descendants), without the tail text that follows tag itself.
    if tag.text:
        yield tag.text
    for child in tag:
        # Nodes whose .tag is not a string (e.g. comments) get an empty name
        # and are therefore treated like skipped tags.
        q = barename(child.tag).lower() if isinstance(child.tag, str) else ''
        if q and q not in skipped_tags:
            is_block = q in BLOCK_TAG_NAMES
            if is_block:
                yield '\n\n'
            yield from _tag_contents(child)
            if is_block:
                # Close the block as well, otherwise whatever follows it (its
                # tail or an inline sibling) gets glued to its last word:
                # <p>one</p>two would become "onetwo".
                yield '\n\n'
        # The tail belongs to the parent's text flow, so it is kept even when
        # the child itself is skipped.
        if child.tail:
            yield child.tail


 def tag_to_text(tag):
    """Yield the pieces of text contained in tag, in document order.

    Children listed in skipped_tags and nodes without a string tag name
    contribute only their tail text. Children listed in BLOCK_TAG_NAMES are
    surrounded by a paragraph break ('\\n\\n') so that words on either side of
    a block boundary never run together. The tail of tag itself is yielded
    last. Callers are expected to join the pieces and collapse runs of
    newlines, as html_to_text() does.
    """
    yield from _tag_contents(tag)
    if tag.tail:
        yield tag.tail
 def html_to_text(root):
    """Yield one plain text string for every h:body element found directly under root.

    The tail of each body is blanked (the tree is modified in place) so that
    it is not included in the text. The joined text is stripped and every run
    of three or more newlines is reduced to exactly two.
    """
    collapse_newlines = re.compile(r'\n{3,}').sub
    bodies = root.xpath('h:body', namespaces=XPNSMAP)
    for body in bodies:
        body.tail = ''
        joined = ''.join(tag_to_text(body))
        yield collapse_newlines('\n\n', joined.strip())
 def to_text(container, name):
    """Yield the text chunks html_to_text() produces for the file called name in container."""
    parsed_root = container.parsed(name)
    for chunk in html_to_text(parsed_root):
        yield chunk
 def extract_text(pathtoebook):
    """Return the text of the book at pathtoebook as a single NFC normalized string.

    An empty string is returned when no input plugin exists for the file
    extension, or when the input plugin (looked up by extension, and again by
    the format reported by extract_book()) is flagged as an image collection.
    Otherwise the book is extracted into a temporary directory and the text
    of every spine item is joined with three newlines.
    """
    def is_image_collection(plugin):
        return bool(getattr(plugin, 'is_image_collection', False))

    extension = pathtoebook.rpartition('.')[-1].upper()
    plugin = plugin_for_input_format(extension)
    if not plugin or is_image_collection(plugin):
        return ''
    with TemporaryDirectory() as tdir:
        _book_fmt, opfpath, detected_fmt = extract_book(pathtoebook, tdir, log=default_log)
        # The format reported by the extraction can differ from the file
        # extension, so the image collection check is repeated for it.
        if is_image_collection(plugin_for_input_format(detected_fmt)):
            return ''
        container = SimpleContainer(tdir, opfpath, default_log)
        chunks = [
            chunk
            for name, _is_linear in container.spine_names
            for chunk in to_text(container, name)
        ]
        combined = '\n\n\n'.join(chunks)
    return unicodedata.normalize('NFC', combined)
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@ -7,6 +7,7 @@ import sys
 from io import BytesIO
 from calibre.db.tests.base import BaseTest
 from calibre.db.fts.text import html_to_text
 def print(*args, **kwargs):
@ -50,6 +51,18 @@ class FTSAPITest(BaseTest):
        fts.add_text(2, 'ADDED', 'data2')
        self.ae(fts.all_currently_dirty(), [])
    def test_fts_to_text(self):
        # Check the plain text html_to_text() produces for a small document.
        from calibre.ebooks.oeb.polish.parsing import parse
        # The markup exercises: two adjacent block tags, a word split by an
        # inline tag and by a comment-like node, and a block nested in a block.
        html = '''
 <html><body>
 <div>first_para</div><p>second_para</p>
 <p>some <i>itali</i>c t<!- c -->ext</p>
 <div>nested<p>blocks</p></div>
 </body></html>
 '''
        root = parse(html)
        # A single string is expected (one per body), with the blocks
        # separated by exactly one blank line and the split word rejoined.
        self.ae(tuple(html_to_text(root)), ('first_para\n\nsecond_para\n\nsome italic text\n\nnested\n\nblocks',))
 def find_tests():
    import unittest
--- a/src/calibre/ebooks/oeb/polish/utils.py
+++ b/src/calibre/ebooks/oeb/polish/utils.py
@ -10,6 +10,13 @@ from bisect import bisect
 from calibre import guess_type as _guess_type, replace_entities
 # Lower-cased names of the HTML tags treated as block level elements. Imported
 # by calibre.db.fts.text and by the tweak_book HTML editor smarts.
 BLOCK_TAG_NAMES = frozenset((
    'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
    'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
    'ol', 'li', 'body', 'td', 'th'))
 def guess_type(x):
    """Return the first item of _guess_type(x), or 'application/octet-stream' when that item is falsy."""
    guessed = _guess_type(x)[0]
    if guessed:
        return guessed
    return 'application/octet-stream'
--- a/src/calibre/gui2/tweak_book/editor/smarts/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py
@ -15,6 +15,7 @@ from qt.core import Qt, QTextCursor, QTextEdit
 from calibre import prepare_string_for_xml, xml_entity_to_unicode
 from calibre.ebooks.oeb.base import css_text
 from calibre.ebooks.oeb.polish.container import OEB_DOCS
 from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
 from calibre.gui2 import error_dialog
 from calibre.gui2.tweak_book import current_container, tprefs
 from calibre.gui2.tweak_book.editor.smarts import NullSmarts
@ -284,13 +285,6 @@ def ensure_not_within_tag_definition(cursor, forward=True):
    return False
 # Lower-cased names of the HTML tags treated as block level elements; the
 # default for the block_tag_names argument of
 # find_closest_containing_block_tag().
 BLOCK_TAG_NAMES = frozenset((
    'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
    'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
    'ol', 'li', 'body', 'td', 'th'))
 def find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES):
    while True:
        tag = find_closest_containing_tag(block, offset)