mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Function to unwrap marked sentences
This commit is contained in:
parent
6a5799edd9
commit
7e709b4fc7
@ -5,13 +5,13 @@
|
|||||||
import re
|
import re
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from css_selectors.select import Select, get_parsed_selector
|
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.ebooks.metadata.tag_mapper import uniq
|
from calibre.ebooks.metadata.tag_mapper import uniq
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS, XPath
|
from calibre.ebooks.oeb.base import OEB_DOCS, XPath
|
||||||
from calibre.ebooks.oeb.parse_utils import XHTML
|
from calibre.ebooks.oeb.parse_utils import XHTML
|
||||||
|
from css_selectors.select import Select, get_parsed_selector
|
||||||
|
|
||||||
|
|
||||||
def non_empty_validator(label, val):
|
def non_empty_validator(label, val):
|
||||||
|
@ -197,7 +197,14 @@ class Structure(BaseTest):
|
|||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html
|
from calibre.ebooks.oeb.polish.tts import id_prefix, mark_sentences_in_html, unmark_sentences_in_html
|
||||||
|
|
||||||
|
def normalize_markup(root):
    """Return the serialized ``<body>`` markup of *root* with marker ids normalized.

    The tree is serialized to a unicode string with ``html.tostring``. The
    result is then cut down to the text starting at the first ``<body`` and
    ending just before the last ``</body>``, and every occurrence of
    ``id_prefix`` is removed so that ids can be compared without the prefix.
    """
    serialized = html.tostring(root, encoding='unicode')
    # Drop everything that precedes the opening body tag.
    body_start = serialized.find('<body')
    from_body = serialized[body_start:]
    # Drop the closing body tag and whatever follows it.
    body_end = from_body.rfind('</body>')
    return from_body[:body_end].replace(id_prefix, '')
|
||||||
|
|
||||||
for text, expected in reversed({
|
for text, expected in reversed({
|
||||||
'<p id=1>hello cruel world': '<body><p id="1"><span id="1">hello cruel world</span></p>',
|
'<p id=1>hello cruel world': '<body><p id="1"><span id="1">hello cruel world</span></p>',
|
||||||
|
|
||||||
@ -234,12 +241,12 @@ class Structure(BaseTest):
|
|||||||
'<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>',
|
'<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>',
|
||||||
}.items()):
|
}.items()):
|
||||||
root = parse(text, namespace_elements=True)
|
root = parse(text, namespace_elements=True)
|
||||||
|
orig = normalize_markup(root)
|
||||||
mark_sentences_in_html(root)
|
mark_sentences_in_html(root)
|
||||||
actual = html.tostring(root, encoding='unicode')
|
marked = normalize_markup(root)
|
||||||
actual = actual[actual.find('<body'):]
|
self.assertEqual(expected, marked)
|
||||||
actual = actual[:actual.rfind('</body>')]
|
unmark_sentences_in_html(root)
|
||||||
actual = actual.replace('cttsw-', '')
|
self.assertEqual(orig, normalize_markup(root), f'Unmarking failed for {marked}')
|
||||||
self.assertEqual(expected, actual)
|
|
||||||
sentences = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
|
sentences = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
|
||||||
self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra'))
|
self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra'))
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ from typing import NamedTuple
|
|||||||
from lxml.etree import ElementBase as Element
|
from lxml.etree import ElementBase as Element
|
||||||
from lxml.etree import tostring as _tostring
|
from lxml.etree import tostring as _tostring
|
||||||
|
|
||||||
|
from calibre.ebooks.html_transform_rules import unwrap_tag
|
||||||
from calibre.ebooks.oeb.base import barename
|
from calibre.ebooks.oeb.base import barename
|
||||||
from calibre.spell.break_iterator import sentence_positions
|
from calibre.spell.break_iterator import sentence_positions
|
||||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||||
@ -51,6 +52,14 @@ continued_tag_names = frozenset({
|
|||||||
ignored_tag_names = frozenset({
|
ignored_tag_names = frozenset({
|
||||||
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
|
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
|
||||||
})
|
})
|
||||||
|
id_prefix = 'cttsw-'
|
||||||
|
|
||||||
|
|
||||||
|
def unmark_sentences_in_html(root):
    """Undo the sentence marking applied to *root*.

    Every element whose ``id`` attribute starts with ``id_prefix`` has that
    ``id`` removed. If the element is a ``span`` that is left with no
    attributes at all, it is handed to ``unwrap_tag`` (presumably this
    replaces the element with its contents; confirm against
    calibre.ebooks.html_transform_rules).

    :param root: Root of the parsed tree; must provide an lxml-style
        ``xpath()`` method.
    """
    # xpath() returns a list, so the tree can be modified while looping.
    for elem in root.xpath(f'//*[starts-with(@id, "{id_prefix}")]'):
        elem.attrib.pop('id')
        if elem.attrib:
            # The element still carries other attributes, so only the
            # marker id is dropped and the element itself is kept.
            continue
        tag = elem.tag
        # Compare the local name exactly (after stripping any "{namespace}"
        # part) instead of testing for a "span" suffix, which would also
        # accept unrelated tags such as SVG's "tspan".
        if isinstance(tag, str) and tag.rpartition('}')[2] == 'span':
            unwrap_tag(elem)
|
||||||
|
|
||||||
|
|
||||||
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
|
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
|
||||||
@ -106,7 +115,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
def make_into_wrapper(self, elem: Element) -> str:
|
def make_into_wrapper(self, elem: Element) -> str:
|
||||||
nonlocal id_counter
|
nonlocal id_counter
|
||||||
while True:
|
while True:
|
||||||
q = f'cttsw-{id_counter}'
|
q = f'{id_prefix}{id_counter}'
|
||||||
if q not in seen_ids:
|
if q not in seen_ids:
|
||||||
elem.set('id', q)
|
elem.set('id', q)
|
||||||
seen_ids.add(q)
|
seen_ids.add(q)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user