mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Function to unwrap marked sentences
This commit is contained in:
parent
6a5799edd9
commit
7e709b4fc7
@ -5,13 +5,13 @@
|
||||
import re
|
||||
from functools import partial
|
||||
|
||||
from css_selectors.select import Select, get_parsed_selector
|
||||
from html5_parser import parse
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.metadata.tag_mapper import uniq
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, XPath
|
||||
from calibre.ebooks.oeb.parse_utils import XHTML
|
||||
from css_selectors.select import Select, get_parsed_selector
|
||||
|
||||
|
||||
def non_empty_validator(label, val):
|
||||
|
@ -197,7 +197,14 @@ class Structure(BaseTest):
|
||||
from html5_parser import parse
|
||||
from lxml import html
|
||||
|
||||
from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html
|
||||
from calibre.ebooks.oeb.polish.tts import id_prefix, mark_sentences_in_html, unmark_sentences_in_html
|
||||
|
||||
def normalize_markup(root):
    '''
    Serialize root to a unicode string and return only the body portion
    with every occurrence of id_prefix removed, so that markup can be
    compared without the generated id prefix getting in the way.
    '''
    serialized = html.tostring(root, encoding='unicode')
    # Drop everything that precedes the opening body tag
    body_start = serialized.find('<body')
    from_body = serialized[body_start:]
    # Drop the last closing body tag and anything after it
    body_end = from_body.rfind('</body>')
    return from_body[:body_end].replace(id_prefix, '')
|
||||
|
||||
for text, expected in reversed({
|
||||
'<p id=1>hello cruel world': '<body><p id="1"><span id="1">hello cruel world</span></p>',
|
||||
|
||||
@ -234,12 +241,12 @@ class Structure(BaseTest):
|
||||
'<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>',
|
||||
}.items()):
|
||||
root = parse(text, namespace_elements=True)
|
||||
orig = normalize_markup(root)
|
||||
mark_sentences_in_html(root)
|
||||
actual = html.tostring(root, encoding='unicode')
|
||||
actual = actual[actual.find('<body'):]
|
||||
actual = actual[:actual.rfind('</body>')]
|
||||
actual = actual.replace('cttsw-', '')
|
||||
self.assertEqual(expected, actual)
|
||||
marked = normalize_markup(root)
|
||||
self.assertEqual(expected, marked)
|
||||
unmark_sentences_in_html(root)
|
||||
self.assertEqual(orig, normalize_markup(root), f'Unmarking failed for {marked}')
|
||||
sentences = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
|
||||
self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra'))
|
||||
|
||||
|
@ -9,6 +9,7 @@ from typing import NamedTuple
|
||||
from lxml.etree import ElementBase as Element
|
||||
from lxml.etree import tostring as _tostring
|
||||
|
||||
from calibre.ebooks.html_transform_rules import unwrap_tag
|
||||
from calibre.ebooks.oeb.base import barename
|
||||
from calibre.spell.break_iterator import sentence_positions
|
||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||
@ -51,6 +52,14 @@ continued_tag_names = frozenset({
|
||||
ignored_tag_names = frozenset({
|
||||
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
|
||||
})
|
||||
id_prefix = 'cttsw-'
|
||||
|
||||
|
||||
def unmark_sentences_in_html(root):
    '''
    Remove the sentence markers from the tree rooted at root.

    Every element whose id attribute starts with id_prefix has that id
    removed. If the element is then left with no attributes at all and its
    tag name ends with "span", it is passed to unwrap_tag().
    '''
    marked_elements = root.xpath(f'//*[starts-with(@id, "{id_prefix}")]')
    for elem in marked_elements:
        elem.attrib.pop('id')
        if elem.attrib:
            # Still carries other attributes, so keep the element itself
            continue
        tag = elem.tag
        if tag and tag.endswith('span'):
            unwrap_tag(elem)
|
||||
|
||||
|
||||
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
|
||||
@ -106,7 +115,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
||||
def make_into_wrapper(self, elem: Element) -> str:
|
||||
nonlocal id_counter
|
||||
while True:
|
||||
q = f'cttsw-{id_counter}'
|
||||
q = f'{id_prefix}{id_counter}'
|
||||
if q not in seen_ids:
|
||||
elem.set('id', q)
|
||||
seen_ids.add(q)
|
||||
|
Loading…
x
Reference in New Issue
Block a user