Function to unwrap marked sentences

This commit is contained in:
Kovid Goyal 2024-10-15 16:53:58 +05:30
parent 6a5799edd9
commit 7e709b4fc7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 24 additions and 8 deletions

View File

@ -5,13 +5,13 @@
import re
from functools import partial
from css_selectors.select import Select, get_parsed_selector
from html5_parser import parse
from lxml import etree
from calibre.ebooks.metadata.tag_mapper import uniq
from calibre.ebooks.oeb.base import OEB_DOCS, XPath
from calibre.ebooks.oeb.parse_utils import XHTML
from css_selectors.select import Select, get_parsed_selector
def non_empty_validator(label, val):

View File

@ -197,7 +197,14 @@ class Structure(BaseTest):
from html5_parser import parse
from lxml import html
from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html
from calibre.ebooks.oeb.polish.tts import id_prefix, mark_sentences_in_html, unmark_sentences_in_html
def normalize_markup(root):
    # Serialize the tree, keep only the text from the opening <body tag up to
    # (but excluding) the last </body>, and drop every occurrence of the
    # generated id prefix so the result can be compared with the short ids
    # used in the expected strings.
    serialized = html.tostring(root, encoding='unicode')
    body_start = serialized.find('<body')
    from_body = serialized[body_start:]
    body_end = from_body.rfind('</body>')
    return from_body[:body_end].replace(id_prefix, '')
for text, expected in reversed({
'<p id=1>hello cruel world': '<body><p id="1"><span id="1">hello cruel world</span></p>',
@ -234,12 +241,12 @@ class Structure(BaseTest):
'<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>',
}.items()):
root = parse(text, namespace_elements=True)
orig = normalize_markup(root)
mark_sentences_in_html(root)
actual = html.tostring(root, encoding='unicode')
actual = actual[actual.find('<body'):]
actual = actual[:actual.rfind('</body>')]
actual = actual.replace('cttsw-', '')
self.assertEqual(expected, actual)
marked = normalize_markup(root)
self.assertEqual(expected, marked)
unmark_sentences_in_html(root)
self.assertEqual(orig, normalize_markup(root), f'Unmarking failed for {marked}')
sentences = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra'))

View File

@ -9,6 +9,7 @@ from typing import NamedTuple
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.html_transform_rules import unwrap_tag
from calibre.ebooks.oeb.base import barename
from calibre.spell.break_iterator import sentence_positions
from calibre.utils.localization import canonicalize_lang, get_lang
@ -51,6 +52,14 @@ continued_tag_names = frozenset({
ignored_tag_names = frozenset({
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
})
# Prefix of the id values set on sentence wrapper elements by
# mark_sentences_in_html(); unmark_sentences_in_html() matches ids on it.
id_prefix = 'cttsw-'
def unmark_sentences_in_html(root):
    '''
    Remove the ids added by sentence marking.

    Every element whose id starts with id_prefix loses that id. If the
    element then has no attributes left and its tag name ends with "span",
    it is passed to unwrap_tag().
    '''
    # The XPath query returns a list, so the tree can be modified safely
    # while looping over the matches.
    marked = root.xpath(f'//*[starts-with(@id, "{id_prefix}")]')
    for elem in marked:
        del elem.attrib['id']
        if elem.attrib:
            # Other attributes remain, so the element is kept as is.
            continue
        tag = elem.tag
        if tag and tag.endswith('span'):
            unwrap_tag(elem)
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
@ -106,7 +115,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
def make_into_wrapper(self, elem: Element) -> str:
nonlocal id_counter
while True:
q = f'cttsw-{id_counter}'
q = f'{id_prefix}{id_counter}'
if q not in seen_ids:
elem.set('id', q)
seen_ids.add(q)