From 58bde2e3040ca04d7a4c15539daef0165ce6166c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 14 Feb 2022 21:22:03 +0530
Subject: [PATCH] Basic implementation of extracting searchable text from HTML

---
 src/calibre/db/fts/text.py                    | 74 +++++++++++++++++++
 src/calibre/db/tests/fts_api.py               | 13 ++++
 src/calibre/ebooks/oeb/polish/utils.py        |  7 ++
 .../gui2/tweak_book/editor/smarts/html.py     |  8 +-
 4 files changed, 95 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/db/fts/text.py
diff --git a/src/calibre/db/fts/text.py b/src/calibre/db/fts/text.py
new file mode 100644
index 0000000000..4c9ca75a71
--- /dev/null
+++ b/src/calibre/db/fts/text.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
+
+
+import re
+import unicodedata
+
+from calibre.customize.ui import plugin_for_input_format
+from calibre.ebooks.oeb.base import XPNSMAP, barename
+from calibre.ebooks.oeb.iterator.book import extract_book
+from calibre.ebooks.oeb.polish.container import Container as ContainerBase
+from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.logging import default_log
+
+
+class SimpleContainer(ContainerBase):
+    # The container extract_text() uses to read the extracted book, differs from its base only in tweak_mode
+    tweak_mode = True
+
+
+skipped_tags = frozenset({'style', 'title', 'script', 'head', 'img', 'svg', 'math'})  # lowercase tag names whose content tag_to_text() ignores
+
+
+def tag_to_text(tag):
+    ' Yield, in document order, the text pieces of tag and its descendants, finishing with the tail of tag itself '
+    if tag.text:
+        yield tag.text
+    for node in tag:
+        name = barename(node.tag).lower() if isinstance(node.tag, str) else ''
+        if name and name not in skipped_tags:
+            if name in BLOCK_TAG_NAMES:
+                yield '\n\n'  # every block level tag starts a new paragraph
+            yield from tag_to_text(node)
+        elif node.tail:  # children with a non-string tag and skipped tags contribute only the text that follows them
+            yield node.tail
+    if tag.tail:
+        yield tag.tail
+
+
+def html_to_text(root):
+    ' Yield the stripped text of each h:body child of root, with runs of three or more newlines collapsed into two '
+    for body in root.xpath('h:body', namespaces=XPNSMAP):
+        body.tail = ''  # tag_to_text() also yields the tail of the tag it is given, so clear it for body
+        yield re.sub(r'\n{3,}', '\n\n', ''.join(tag_to_text(body)).strip())
+
+
+def to_text(container, name):
+    ' Yield the text of the file name in container, produced by html_to_text() from its parsed tree '
+    yield from html_to_text(container.parsed(name))
+
+
+def extract_text(pathtoebook):
+    '''
+    Return the NFC normalized text of the spine items of the book at pathtoebook, joined by three newlines.
+
+    An empty string is returned when no input plugin is found for the file
+    extension or when the input plugin is for image collections.
+    '''
+    fmt = pathtoebook.rpartition('.')[-1].upper()
+    plugin = plugin_for_input_format(fmt)
+    if not plugin or getattr(plugin, 'is_image_collection', False):
+        return ''
+    with TemporaryDirectory() as tdir:
+        # extract_book() returns an input format of its own, so look up the plugin for that one as well
+        book_fmt, opfpath, fmt = extract_book(pathtoebook, tdir, log=default_log)
+        if getattr(plugin_for_input_format(fmt), 'is_image_collection', False):
+            return ''
+        container = SimpleContainer(tdir, opfpath, default_log)
+        texts = []
+        for name, is_linear in container.spine_names:
+            texts.extend(to_text(container, name))
+    return unicodedata.normalize('NFC', '\n\n\n'.join(texts))
diff --git a/src/calibre/db/tests/fts_api.py b/src/calibre/db/tests/fts_api.py
index b18b42150f..f9336aee10 100644
--- a/src/calibre/db/tests/fts_api.py
+++ b/src/calibre/db/tests/fts_api.py
@@ -7,6 +7,7 @@ import sys
 from io import BytesIO
 
 from calibre.db.tests.base import BaseTest
+from calibre.db.fts.text import html_to_text
 
 
 def print(*args, **kwargs):
@@ -50,6 +51,18 @@ class FTSAPITest(BaseTest):
         fts.add_text(2, 'ADDED', 'data2')
         self.ae(fts.all_currently_dirty(), [])
 
+    def test_fts_to_text(self):
+        # Block level tags turn into paragraph breaks, the inline <i> tag and the <!- c --> markup leave no trace
+        from calibre.ebooks.oeb.polish.parsing import parse
+        expected = 'first_para\n\nsecond_para\n\nsome italic text\n\nnested\n\nblocks'
+        self.ae(tuple(html_to_text(parse('''
+<html><body>
+<div>first_para</div><p>second_para</p>
+<p>some <i>itali</i>c t<!- c -->ext</p>
+<div>nested<p>blocks</p></div>
+</body></html>
+'''))), (expected,))
+
 
 def find_tests():
     import unittest
diff --git a/src/calibre/ebooks/oeb/polish/utils.py b/src/calibre/ebooks/oeb/polish/utils.py
index 0ec5ffc25b..d27626541d 100644
--- a/src/calibre/ebooks/oeb/polish/utils.py
+++ b/src/calibre/ebooks/oeb/polish/utils.py
@@ -10,6 +10,13 @@ from bisect import bisect
 from calibre import guess_type as _guess_type, replace_entities
 
 
+BLOCK_TAG_NAMES = frozenset((  # lowercase names of the HTML tags treated as block level
+    'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
+    'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
+    'ol', 'li', 'body', 'td', 'th'))
+
+
 def guess_type(x):
     return _guess_type(x)[0] or 'application/octet-stream'
 
diff --git a/src/calibre/gui2/tweak_book/editor/smarts/html.py b/src/calibre/gui2/tweak_book/editor/smarts/html.py
index f64c58b8d1..402b2f2ca4 100644
--- a/src/calibre/gui2/tweak_book/editor/smarts/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py
@@ -15,6 +15,7 @@ from qt.core import Qt, QTextCursor, QTextEdit
 from calibre import prepare_string_for_xml, xml_entity_to_unicode
 from calibre.ebooks.oeb.base import css_text
 from calibre.ebooks.oeb.polish.container import OEB_DOCS
+from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
 from calibre.gui2 import error_dialog
 from calibre.gui2.tweak_book import current_container, tprefs
 from calibre.gui2.tweak_book.editor.smarts import NullSmarts
@@ -284,13 +285,6 @@ def ensure_not_within_tag_definition(cursor, forward=True):
     return False
 
 
-BLOCK_TAG_NAMES = frozenset((
-    'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
-    'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
-    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
-    'ol', 'li', 'body', 'td', 'th'))
-
-
 def find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES):
     while True:
         tag = find_closest_containing_tag(block, offset)