Basic implementation of extracting searchable text from HTML

2025-07-07 18:24:30 -04:00 · 2022-02-14 21:22:03 +05:30 · 2022-02-14 21:22:03 +05:30 · 58bde2e304
commit 58bde2e304
parent b66c72cc15
4 changed files with 95 additions and 7 deletions
--- a/src/calibre/db/fts/text.py
+++ b/src/calibre/db/fts/text.py
@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
+
+
+import re
+import unicodedata
+
+from calibre.customize.ui import plugin_for_input_format
+from calibre.ebooks.oeb.base import XPNSMAP, barename
+from calibre.ebooks.oeb.iterator.book import extract_book
+from calibre.ebooks.oeb.polish.container import Container as ContainerBase
+from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.logging import default_log
+
+
+class SimpleContainer(ContainerBase):
+    ''' Container subclass that extract_text() uses to walk the spine of an extracted book. '''
+
+    # NOTE(review): the effect of this flag is defined in ContainerBase, not
+    # here; presumably it selects the editor ("tweak") style of handling -- confirm there.
+    tweak_mode = True
+
+
+# Elements whose own text and descendants are left out by tag_to_text(); only
+# the text that follows such an element (its tail) is still emitted.
+skipped_tags = frozenset({'style', 'title', 'script', 'head', 'img', 'svg', 'math'})
+
+
+def _tag_content_to_text(tag):
+    ''' Yield the text of ``tag`` and of its descendants, excluding the tail of ``tag`` itself. '''
+    if tag.text:
+        yield tag.text
+    for child in tag:
+        # Nodes whose tag is not a string (comments, processing instructions)
+        # get an empty name and are treated like skipped tags.
+        q = barename(child.tag).lower() if isinstance(child.tag, str) else ''
+        if q and q not in skipped_tags:
+            is_block = q in BLOCK_TAG_NAMES
+            if is_block:
+                yield '\n\n'
+            elif q in ('br', 'hr'):
+                # Forced line breaks separate words even without whitespace
+                # in the markup: foo<br/>bar must not become "foobar".
+                yield '\n'
+            yield from _tag_content_to_text(child)
+            if is_block:
+                # Also break *after* a block, otherwise text following its
+                # closing tag is glued onto the last word inside the block.
+                yield '\n\n'
+        # The tail belongs to the parent's flow of text, so it is kept even
+        # when the child itself is skipped.
+        if child.tail:
+            yield child.tail
+
+
+def tag_to_text(tag):
+    '''
+    Yield, in document order, the text fragments of ``tag``, of its
+    descendants and finally the tail text of ``tag`` itself.
+
+    Children named in ``skipped_tags`` and children without a string tag
+    contribute only their tail. Children named in ``BLOCK_TAG_NAMES`` are
+    surrounded by paragraph breaks (two newlines) and ``br``/``hr`` produce a
+    newline, so that words in separate blocks or lines are never merged into a
+    single word. Runs of newlines are not collapsed here; that is left to the
+    caller.
+
+    :param tag: An element exposing ``text``, ``tail``, ``tag`` and iteration
+                over its children (as lxml elements do).
+    :return: A generator of strings.
+    '''
+    yield from _tag_content_to_text(tag)
+    if tag.tail:
+        yield tag.tail
+
+
+def html_to_text(root):
+    '''
+    Yield one string of plain text for every ``body`` element (in the XHTML
+    namespace) that is a direct child of ``root``.
+
+    The text is stripped of leading and trailing whitespace and every run of
+    three or more newlines is reduced to a single paragraph break.
+
+    :param root: The root element of a parsed HTML document.
+    :return: A generator of strings.
+    '''
+    pat = re.compile(r'\n{3,}')
+    for body in root.xpath('h:body', namespaces=XPNSMAP):
+        # tag_to_text() also emits the tail of the tag it is given. Blank the
+        # tail while extracting so text after </body> is excluded, then put it
+        # back so that the caller's tree is left exactly as it was passed in.
+        saved_tail = body.tail
+        body.tail = ''
+        try:
+            text = ''.join(tag_to_text(body))
+        finally:
+            body.tail = saved_tail
+        yield pat.sub('\n\n', text.strip())
+
+
+def to_text(container, name):
+    ''' Yield the text that html_to_text() produces for the parsed file ``name`` of ``container``. '''
+    yield from html_to_text(container.parsed(name))
+
+
+def extract_text(pathtoebook):
+    '''
+    Return the text of the book at ``pathtoebook`` as a single NFC normalized
+    string, with the text of consecutive spine items separated by three
+    newlines.
+
+    An empty string is returned when no input plugin exists for the file
+    extension, or when the input plugin (looked up either from the extension
+    or from the format reported by extract_book()) is flagged as an image
+    collection.
+    '''
+
+    def is_image_collection(plugin):
+        return bool(getattr(plugin, 'is_image_collection', False))
+
+    extension = pathtoebook.rpartition('.')[-1].upper()
+    plugin = plugin_for_input_format(extension)
+    if not plugin:
+        return ''
+    if is_image_collection(plugin):
+        return ''
+    with TemporaryDirectory() as tdir:
+        unused_fmt, opfpath, actual_fmt = extract_book(pathtoebook, tdir, log=default_log)
+        # extract_book() reports the input format itself, so check again with
+        # the plugin for that format.
+        if is_image_collection(plugin_for_input_format(actual_fmt)):
+            return ''
+        container = SimpleContainer(tdir, opfpath, default_log)
+        pieces = [
+            text
+            for name, linear in container.spine_names
+            for text in to_text(container, name)
+        ]
+        combined = '\n\n\n'.join(pieces)
+    return unicodedata.normalize('NFC', combined)
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@ -7,6 +7,7 @@ import sys
 from io import BytesIO

 from calibre.db.tests.base import BaseTest
+from calibre.db.fts.text import html_to_text


 def print(*args, **kwargs):
@ -50,6 +51,18 @@ class FTSAPITest(BaseTest):
        fts.add_text(2, 'ADDED', 'data2')
        self.ae(fts.all_currently_dirty(), [])

+    def test_fts_to_text(self):
+        # Block level elements must end up separated by a blank line, while
+        # inline markup in the middle of a word must not split the word.
+        from calibre.ebooks.oeb.polish.parsing import parse
+        markup = '''
+<html><body>
+<div>first_para</div><p>second_para</p>
+<p>some <i>itali</i>c t<!- c -->ext</p>
+<div>nested<p>blocks</p></div>
+</body></html>
+'''
+        expected = 'first_para\n\nsecond_para\n\nsome italic text\n\nnested\n\nblocks'
+        extracted = tuple(html_to_text(parse(markup)))
+        self.ae(extracted, (expected,))
+

 def find_tests():
    import unittest
--- a/src/calibre/ebooks/oeb/polish/utils.py
+++ b/src/calibre/ebooks/oeb/polish/utils.py
@ -10,6 +10,13 @@ from bisect import bisect
 from calibre import guess_type as _guess_type, replace_entities


+# Lower case names of the HTML tags that are treated as block level elements.
+# Kept in this utility module so that code outside the editor can import it.
+BLOCK_TAG_NAMES = frozenset((
+    'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
+    'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
+    'ol', 'li', 'body', 'td', 'th'))
+
+
 def guess_type(x):
    return _guess_type(x)[0] or 'application/octet-stream'

--- a/src/calibre/gui2/tweak_book/editor/smarts/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py
@ -15,6 +15,7 @@ from qt.core import Qt, QTextCursor, QTextEdit
 from calibre import prepare_string_for_xml, xml_entity_to_unicode
 from calibre.ebooks.oeb.base import css_text
 from calibre.ebooks.oeb.polish.container import OEB_DOCS
+from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
 from calibre.gui2 import error_dialog
 from calibre.gui2.tweak_book import current_container, tprefs
 from calibre.gui2.tweak_book.editor.smarts import NullSmarts
@ -284,13 +285,6 @@ def ensure_not_within_tag_definition(cursor, forward=True):
    return False


-BLOCK_TAG_NAMES = frozenset((
-    'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
-    'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
-    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
-    'ol', 'li', 'body', 'td', 'th'))
-
-
 def find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES):
    while True:
        tag = find_closest_containing_tag(block, offset)