Basic implementation of extracting searchable text from HTML

This commit is contained in:
Kovid Goyal 2022-02-14 21:22:03 +05:30
parent b66c72cc15
commit 58bde2e304
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 95 additions and 7 deletions

View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import re
import unicodedata
from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks.oeb.base import XPNSMAP, barename
from calibre.ebooks.oeb.iterator.book import extract_book
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.logging import default_log
class SimpleContainer(ContainerBase):
    """Book container used for text extraction, with ``tweak_mode`` enabled."""

    tweak_mode = True
# Elements whose own contents are never emitted by tag_to_text(); only the
# text that follows them (their tail) is kept.
skipped_tags = frozenset((
    'style', 'title', 'script', 'head', 'img', 'svg', 'math',
))
def _tag_contents_to_text(tag):
    """Yield the text found inside *tag*, excluding *tag*'s own tail.

    Children named in ``skipped_tags`` and nodes whose ``tag`` is not a
    string (in lxml these are comments and processing instructions)
    contribute only their tail text. Children named in ``BLOCK_TAG_NAMES``
    are surrounded by a blank-line separator on both sides.
    """
    if tag.text:
        yield tag.text
    for child in tag:
        name = barename(child.tag).lower() if isinstance(child.tag, str) else ''
        if name and name not in skipped_tags:
            is_block = name in BLOCK_TAG_NAMES
            if is_block:
                yield '\n\n'
            yield from _tag_contents_to_text(child)
            if is_block:
                # Separate the block from whatever follows it as well.
                # Without this, '<p>foo</p>bar' or '<div>a</div><i>b</i>'
                # produces the fused words 'foobar' / 'ab'.
                yield '\n\n'
        # The tail belongs after the child regardless of whether the child
        # itself was skipped.
        if child.tail:
            yield child.tail


def tag_to_text(tag):
    """Yield the text fragments of *tag*, its descendants and its tail.

    Fragments are yielded in document order and are meant to be joined with
    ``''.join()``. Block-level elements are delimited by ``'\\n\\n'``
    fragments before and after their contents; callers that want tidy output
    should collapse runs of newlines afterwards.

    :param tag: an element exposing ``text``, ``tail``, ``tag`` and iteration
        over its children
    :return: generator of strings
    """
    yield from _tag_contents_to_text(tag)
    if tag.tail:
        yield tag.tail
def html_to_text(root):
    """Yield the plain text of every ``h:body`` child of *root*.

    For each body the fragments produced by tag_to_text() are joined, leading
    and trailing whitespace is stripped and any run of three or more newlines
    is reduced to exactly two.

    Note that the ``tail`` of each body element is reset to the empty string
    on the tree that was passed in.
    """
    squeeze_newlines = re.compile(r'\n{3,}').sub
    for body in root.xpath('h:body', namespaces=XPNSMAP):
        # Text after </body> is not part of the document content.
        body.tail = ''
        raw = ''.join(tag_to_text(body))
        yield squeeze_newlines('\n\n', raw.strip())
def to_text(container, name):
    """Yield the text of the file *name* as parsed by *container*."""
    for text in html_to_text(container.parsed(name)):
        yield text
def extract_text(pathtoebook):
    """Return the NFC-normalized text of the book at *pathtoebook*.

    An empty string is returned when no input plugin is found for the file
    extension, or when the input plugin (looked up by extension, and again by
    the format reported by extract_book()) is flagged as an image collection.

    The book is extracted into a temporary directory and the text of every
    spine item is joined with three newlines.
    """
    extension = pathtoebook.rpartition('.')[-1].upper()
    plugin = plugin_for_input_format(extension)
    if not plugin or getattr(plugin, 'is_image_collection', False):
        return ''

    with TemporaryDirectory() as tdir:
        _, opfpath, actual_fmt = extract_book(pathtoebook, tdir, log=default_log)
        # The format reported by the extraction may differ from the file
        # extension, so the image-collection check is repeated.
        actual_plugin = plugin_for_input_format(actual_fmt)
        if getattr(actual_plugin, 'is_image_collection', False):
            return ''
        container = SimpleContainer(tdir, opfpath, default_log)
        chunks = []
        for spine_item in container.spine_names:
            chunks.extend(to_text(container, spine_item[0]))
        joined = '\n\n\n'.join(chunks)
    return unicodedata.normalize('NFC', joined)

View File

@@ -7,6 +7,7 @@ import sys
from io import BytesIO
from calibre.db.tests.base import BaseTest
from calibre.db.fts.text import html_to_text
def print(*args, **kwargs):
@@ -50,6 +51,18 @@ class FTSAPITest(BaseTest):
fts.add_text(2, 'ADDED', 'data2')
self.ae(fts.all_currently_dirty(), [])
def test_fts_to_text(self):
    """html_to_text() separates block elements with blank lines and drops comments."""
    from calibre.ebooks.oeb.polish.parsing import parse
    markup = '''
<html><body>
<div>first_para</div><p>second_para</p>
<p>some <i>itali</i>c t<!- c -->ext</p>
<div>nested<p>blocks</p></div>
</body></html>
'''
    expected = ('first_para\n\nsecond_para\n\nsome italic text\n\nnested\n\nblocks',)
    actual = tuple(html_to_text(parse(markup)))
    self.ae(actual, expected)
def find_tests():
import unittest

View File

@@ -10,6 +10,13 @@ from bisect import bisect
from calibre import guess_type as _guess_type, replace_entities
# Lower-case element names that are treated as block-level.
BLOCK_TAG_NAMES = frozenset({
    'address', 'article', 'aside', 'blockquote', 'body', 'center',
    'dd', 'dir', 'div', 'dl', 'fieldset',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup',
    'isindex', 'li', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'section', 'td', 'th', 'ul',
})
def guess_type(x):
    """Return the first item of ``_guess_type(x)``, or a generic binary type.

    ``'application/octet-stream'`` is returned whenever that first item is
    falsy.
    """
    mime_type = _guess_type(x)[0]
    if not mime_type:
        return 'application/octet-stream'
    return mime_type

View File

@@ -15,6 +15,7 @@ from qt.core import Qt, QTextCursor, QTextEdit
from calibre import prepare_string_for_xml, xml_entity_to_unicode
from calibre.ebooks.oeb.base import css_text
from calibre.ebooks.oeb.polish.container import OEB_DOCS
from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
from calibre.gui2 import error_dialog
from calibre.gui2.tweak_book import current_container, tprefs
from calibre.gui2.tweak_book.editor.smarts import NullSmarts
@@ -284,13 +285,6 @@ def ensure_not_within_tag_definition(cursor, forward=True):
return False
# Lower-case element names that are treated as block-level.
# NOTE(review): the same name is also imported from
# calibre.ebooks.oeb.polish.utils in this file, and the enclosing hunk shrinks
# from 13 to 6 lines, so this looks like the copy the commit removes - confirm.
BLOCK_TAG_NAMES = frozenset({
    'address', 'article', 'aside', 'blockquote', 'body', 'center',
    'dd', 'dir', 'div', 'dl', 'fieldset',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup',
    'isindex', 'li', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'section', 'td', 'th', 'ul',
})
def find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES):
while True:
tag = find_closest_containing_tag(block, offset)