mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Basic implementation of extracting searchable text from HTML
This commit is contained in:
parent
b66c72cc15
commit
58bde2e304
74
src/calibre/db/fts/text.py
Normal file
74
src/calibre/db/fts/text.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
|
from calibre.ebooks.oeb.base import XPNSMAP, barename
|
||||||
|
from calibre.ebooks.oeb.iterator.book import extract_book
|
||||||
|
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
||||||
|
from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
|
||||||
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
|
from calibre.utils.logging import default_log
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleContainer(ContainerBase):
    """Container used by extract_text() to read the spine of an extracted book.

    It differs from its base class only in the ``tweak_mode`` class flag.
    """

    # NOTE(review): the effect of tweak_mode is defined by ContainerBase,
    # which is not visible here -- confirm before changing this value.
    tweak_mode = True
|
||||||
|
|
||||||
|
|
||||||
|
# Elements whose own content is never turned into searchable text by
# tag_to_text(); the text that follows them (their tail) is still kept.
skipped_tags = frozenset((
    'style',
    'title',
    'script',
    'head',
    'img',
    'svg',
    'math',
))
|
||||||
|
|
||||||
|
|
||||||
|
def _tag_contents(tag):
    """Yield the text found inside *tag*, excluding the tail of *tag* itself."""
    if tag.text:
        yield tag.text
    for child in tag:
        # Comments and processing instructions do not have a string tag, so
        # they get an empty name and are treated like skipped elements.
        q = barename(child.tag).lower() if isinstance(child.tag, str) else ''
        if q and q not in skipped_tags:
            is_block = q in BLOCK_TAG_NAMES
            if is_block:
                yield '\n\n'
            yield from _tag_contents(child)
            if is_block:
                # Also break after the block, otherwise text following its
                # closing tag is glued onto the last word inside the block.
                yield '\n\n'
        # The tail belongs to the parent's text flow, so it is kept even when
        # the child element itself is skipped.
        if child.tail:
            yield child.tail


def tag_to_text(tag):
    """Yield the pieces of searchable text contained in *tag*.

    Yields, in document order: the text of *tag*, the text of its descendant
    elements, and finally the tail of *tag*. Elements named in
    ``skipped_tags`` and nodes whose tag is not a string contribute only
    their tail. Every descendant named in ``BLOCK_TAG_NAMES`` is surrounded
    by ``'\\n\\n'`` so that words on either side of a block boundary never
    run together. No separator is emitted around *tag* itself.

    :param tag: an element exposing ``text``, ``tail`` and ``tag`` attributes
        and iterating over its children.
    :return: a generator of strings; join them to obtain the full text.
    """
    yield from _tag_contents(tag)
    if tag.tail:
        yield tag.tail
|
||||||
|
|
||||||
|
|
||||||
|
def html_to_text(root):
    """Yield one searchable text string for each ``h:body`` child of *root*.

    The text of each body is stripped of leading and trailing whitespace and
    every run of three or more newlines is reduced to two.

    :param root: parsed document root supporting ``xpath()``.
    :return: a generator of strings, one per body element found.
    """
    collapse_blank_lines = re.compile(r'\n{3,}').sub
    bodies = root.xpath('h:body', namespaces=XPNSMAP)
    for body in bodies:
        # tag_to_text() also yields the tail of the element it is given, so
        # clear it to keep text after the body out of the result. Note that
        # this modifies the parsed tree in place.
        body.tail = ''
        raw_text = ''.join(tag_to_text(body))
        yield collapse_blank_lines('\n\n', raw_text.strip())
|
||||||
|
|
||||||
|
|
||||||
|
def to_text(container, name):
    """Yield the searchable text of the file *name* inside *container*.

    :param container: object whose ``parsed(name)`` returns the parsed root.
    :param name: name of the file in the container to convert.
    :return: a generator of the strings produced by html_to_text().
    """
    parsed_root = container.parsed(name)
    for text in html_to_text(parsed_root):
        yield text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(pathtoebook):
    """Return the searchable text of the book at *pathtoebook*.

    The book is extracted into a temporary directory and the text of every
    spine item is joined with three newlines, then NFC normalized. An empty
    string is returned when no input plugin exists for the file extension or
    when the input plugin is flagged as an image collection.

    :param pathtoebook: path to the book file; the text after the last ``.``
        is used as the input format.
    :return: the extracted text as a single string.
    """
    extension = pathtoebook.rpartition('.')[-1]
    plugin = plugin_for_input_format(extension.upper())
    if not plugin:
        return ''
    if getattr(plugin, 'is_image_collection', False):
        return ''

    with TemporaryDirectory() as tdir:
        _book_fmt, opfpath, actual_fmt = extract_book(pathtoebook, tdir, log=default_log)
        # Check again using the format reported by extract_book()
        # (presumably it can differ from the file extension -- confirm).
        actual_plugin = plugin_for_input_format(actual_fmt)
        if getattr(actual_plugin, 'is_image_collection', False):
            return ''
        container = SimpleContainer(tdir, opfpath, default_log)
        texts = []
        for name, _is_linear in container.spine_names:
            texts.extend(to_text(container, name))
        joined = '\n\n\n'.join(texts)
    return unicodedata.normalize('NFC', joined)
|
@ -7,6 +7,7 @@ import sys
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
from calibre.db.tests.base import BaseTest
|
from calibre.db.tests.base import BaseTest
|
||||||
|
from calibre.db.fts.text import html_to_text
|
||||||
|
|
||||||
|
|
||||||
def print(*args, **kwargs):
|
def print(*args, **kwargs):
|
||||||
@ -50,6 +51,18 @@ class FTSAPITest(BaseTest):
|
|||||||
fts.add_text(2, 'ADDED', 'data2')
|
fts.add_text(2, 'ADDED', 'data2')
|
||||||
self.ae(fts.all_currently_dirty(), [])
|
self.ae(fts.all_currently_dirty(), [])
|
||||||
|
|
||||||
|
def test_fts_to_text(self):
    # html_to_text() must separate block elements with a blank line while
    # keeping inline markup and comments from splitting words.
    from calibre.ebooks.oeb.polish.parsing import parse
    html = '''
<html><body>
<div>first_para</div><p>second_para</p>
<p>some <i>itali</i>c t<!- c -->ext</p>
<div>nested<p>blocks</p></div>
</body></html>
'''
    expected = ('first_para\n\nsecond_para\n\nsome italic text\n\nnested\n\nblocks',)
    root = parse(html)
    self.ae(tuple(html_to_text(root)), expected)
|
||||||
|
|
||||||
|
|
||||||
def find_tests():
|
def find_tests():
|
||||||
import unittest
|
import unittest
|
||||||
|
@ -10,6 +10,13 @@ from bisect import bisect
|
|||||||
from calibre import guess_type as _guess_type, replace_entities
|
from calibre import guess_type as _guess_type, replace_entities
|
||||||
|
|
||||||
|
|
||||||
|
# Lower-case names of the HTML elements treated as block-level.
BLOCK_TAG_NAMES = frozenset({
    'address', 'article', 'aside', 'blockquote', 'center',
    'dir', 'fieldset', 'isindex', 'menu', 'noframes',
    'hgroup', 'noscript', 'pre', 'section',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'p', 'div', 'dd', 'dl',
    'ul', 'ol', 'li', 'body', 'td', 'th',
})
|
||||||
|
|
||||||
|
|
||||||
def guess_type(x):
    """Return the MIME type guessed for *x*.

    Uses the first item of the result of calibre's ``guess_type`` and falls
    back to ``'application/octet-stream'`` when that item is empty.
    """
    mime_type = _guess_type(x)[0]
    if mime_type:
        return mime_type
    return 'application/octet-stream'
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@ from qt.core import Qt, QTextCursor, QTextEdit
|
|||||||
from calibre import prepare_string_for_xml, xml_entity_to_unicode
|
from calibre import prepare_string_for_xml, xml_entity_to_unicode
|
||||||
from calibre.ebooks.oeb.base import css_text
|
from calibre.ebooks.oeb.base import css_text
|
||||||
from calibre.ebooks.oeb.polish.container import OEB_DOCS
|
from calibre.ebooks.oeb.polish.container import OEB_DOCS
|
||||||
|
from calibre.ebooks.oeb.polish.utils import BLOCK_TAG_NAMES
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog
|
||||||
from calibre.gui2.tweak_book import current_container, tprefs
|
from calibre.gui2.tweak_book import current_container, tprefs
|
||||||
from calibre.gui2.tweak_book.editor.smarts import NullSmarts
|
from calibre.gui2.tweak_book.editor.smarts import NullSmarts
|
||||||
@ -284,13 +285,6 @@ def ensure_not_within_tag_definition(cursor, forward=True):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
BLOCK_TAG_NAMES = frozenset((
|
|
||||||
'address', 'article', 'aside', 'blockquote', 'center', 'dir', 'fieldset',
|
|
||||||
'isindex', 'menu', 'noframes', 'hgroup', 'noscript', 'pre', 'section',
|
|
||||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'p', 'div', 'dd', 'dl', 'ul',
|
|
||||||
'ol', 'li', 'body', 'td', 'th'))
|
|
||||||
|
|
||||||
|
|
||||||
def find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES):
|
def find_closest_containing_block_tag(block, offset, block_tag_names=BLOCK_TAG_NAMES):
|
||||||
while True:
|
while True:
|
||||||
tag = find_closest_containing_tag(block, offset)
|
tag = find_closest_containing_tag(block, offset)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user