mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Move sentence marking code into its own module
This commit is contained in:
parent
0796e12040
commit
690988033e
@ -193,6 +193,58 @@ class Structure(BaseTest):
|
|||||||
self.assertEqual('a.html', find_cover_page(c))
|
self.assertEqual('a.html', find_cover_page(c))
|
||||||
self.assertEqual('a.html', next(c.spine_names)[0])
|
self.assertEqual('a.html', next(c.spine_names)[0])
|
||||||
|
|
||||||
|
def test_mark_sentences(self):
    # Checks the markup produced by sentence marking against a table of
    # HTML snippets, then checks the per-sentence language that is reported.
    from html5_parser import parse
    from lxml import html

    from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html
    # Each entry is (input markup, expected serialisation of <body> once the
    # 'cttsw-' prefix has been stripped from the generated ids).
    cases = (
        ('<p id=1>hello cruel world', '<body><p id="1"><span id="1">hello cruel world</span></p>'),

        ('<p>hello <b>cruel</b> world', '<body><p><span id="1">hello <b>cruel</b> world</span></p>'),

        ('<p>Yes, please. Hello <b>cruel</b> world.',
         '<body><p><span id="1">Yes, please. </span><span id="2">Hello <b>cruel</b> world.</span></p>'),

        ('<p>Hello <b>cruel</b> <i>world. </i>',
         '<body><p><span id="1">Hello <b>cruel</b> <i>world. </i></span></p>'),

        ('<p>Yes, <b>please.</b> Well done! Bravissima! ',
         '<body><p><span id="1">Yes, <b>please.</b> </span><span id="2">Well done! </span><span id="3">Bravissima! </span></p>'),

        ('<p>Yes, <b>please.</b> Well <i>done! </i>Bravissima! ',
         '<body><p><span id="1">Yes, <b>please.</b> </span><span id="2">Well <i>done! </i></span><span id="3">Bravissima! </span></p>'),

        ('<p><i>Hello</i>, world! Good day to you',
         '<body><p><span id="1"><i>Hello</i>, world! </span><span id="2">Good day to you</span></p>'),

        ('<p><i>Hello, world! </i>Good day to you',
         '<body><p><i id="1">Hello, world! </i><span id="2">Good day to you</span></p>'),

        ('<p><i>Hello, </i><b>world!</b>Good day to you',
         '<body><p><span id="1"><i>Hello, </i><b>world!</b></span><span id="2">Good day to you</span></p>'),

        ('<p><i>Hello, </i><b>world</b>! Good day to you',
         '<body><p><span id="1"><i>Hello, </i><b>world</b>! </span><span id="2">Good day to you</span></p>'),

        ('<p>Hello, <span lang="fr">world!',
         '<body><p><span id="1">Hello, </span><span lang="fr"><span id="2">world!</span></span></p>'),

        ('<p>Hello, <span data-calibre-tts="moose">world!',
         '<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>'),
    )
    # The table is walked last-to-first
    for markup, wanted in reversed(cases):
        tree = parse(markup, namespace_elements=True)
        mark_sentences_in_html(tree)
        serialized = html.tostring(tree, encoding='unicode')
        # Keep only what lies between the opening <body and the final </body>
        body_start = serialized.find('<body')
        serialized = serialized[body_start:]
        body_end = serialized.rfind('</body>')
        got = serialized[:body_end].replace('cttsw-', '')
        self.assertEqual(wanted, got)
    marked = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
    langs = tuple(sentence.lang for sentence in marked)
    self.assertEqual(langs, ('eng', 'fra'))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def find_tests():
|
def find_tests():
|
||||||
import unittest
|
import unittest
|
||||||
|
337
src/calibre/ebooks/oeb/polish/tts.py
Normal file
337
src/calibre/ebooks/oeb/polish/tts.py
Normal file
@ -0,0 +1,337 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from contextlib import suppress
|
||||||
|
from typing import NamedTuple
|
||||||
|
|
||||||
|
from calibre.spell.break_iterator import sentence_positions
|
||||||
|
|
||||||
|
|
||||||
|
class Sentence(NamedTuple):
    '''
    Record describing one sentence that mark_sentences_in_html() has wrapped.
    '''
    # value of the id attribute set on the element wrapping the sentence
    elem_id: str
    # the text of the sentence itself
    text: str
    # language of the element the sentence was found in
    lang: str
    # voice in effect for the element the sentence was found in
    voice: str
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
    '''
    Split the text under the <body> children of root into sentences and wrap
    each sentence in an element carrying a unique id of the form cttsw-N. The
    tree is modified in place.

    :param root: Root element of a parsed HTML tree. It is used through lxml
        element methods such as iterchildren(), xpath() and makeelement().
    :param lang: Language to assume when root declares none. When empty,
        get_lang() is consulted, and 'en' is the last resort.
    :param voice: Voice to record for sentences, unless an element overrides
        it through its data-calibre-tts attribute.
    :return: One Sentence for every wrapped sentence, in the order wrapped.
    '''
    import json

    from lxml.etree import ElementBase as Element
    from lxml.etree import tostring as _tostring

    from calibre.ebooks.oeb.base import barename
    from calibre.utils.localization import canonicalize_lang, get_lang
    # Childless inline tags whose text is merged into the parent's running
    # text, provided their language and voice match the parent's
    continued_tag_names = frozenset({
        'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
    })
    # Tags that are never descended into when looking for text
    ignored_tag_names = frozenset({
        'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
    })

    def tostring(x) -> str:
        # Serialisation helper. Nothing in this function calls it.
        return _tostring(x, encoding='unicode')

    def lang_for_elem(elem, parent_lang):
        # Language declared on elem itself, else the inherited one.
        # NOTE(review): 'xml_lang' is presumably how the parser spells a
        # sanitised xml:lang attribute - confirm.
        return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang

    def has_text(elem):
        # True if elem has non-whitespace text directly inside it, either as
        # its leading text or as the tail of one of its children
        if elem.text and elem.text.strip():
            return True
        for child in elem:
            if child.tail and child.tail.strip():
                return True
        return False

    root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
    root_voice = voice
    # Every id already present in the document, so generated ids never clash
    seen_ids = set(root.xpath('//*/@id'))
    id_counter = 1
    ans = []
    # source element -> clones made of it while splitting; consulted at the
    # end to drop the ones that ended up empty
    clones_map = defaultdict(list)

    class Chunk(NamedTuple):
        # A run of text that contributes to a Parent's concatenated text
        child: Element | None  # element owning the text, None for the parent's own leading text
        text: str
        start_at: int  # offset of this run within the concatenated text
        is_tail: bool = False  # True when the text is child.tail rather than child.text

    class Parent:
        # Gathers the text found directly inside one element, including the
        # text of its simple inline children, and wraps the sentences in it.

        def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):
            self.elem = elem
            self.tag_name = tag_name
            self.lang = child_lang or lang_for_elem(elem, parent_lang)
            q = elem.get('data-calibre-tts', '')
            self.voice = parent_voice
            # The attribute is either a JSON object with a 'voice' key or a
            # bare voice name
            if q.startswith('{'):  # }
                # Malformed JSON is ignored, leaving the inherited voice
                with suppress(Exception):
                    q = json.loads(q)
                    self.voice = q.get('voice') or parent_voice
            else:
                self.voice = q or parent_voice
            # While text is being gathered, pos is the running character
            # offset. During commit() it is reused as an index into texts.
            self.pos = 0
            self.texts = []
            if elem.text and elem.text.strip():
                self.texts.append(Chunk(None, elem.text, self.pos))
                self.pos += len(elem.text)
            # Snapshot of the children, taken before any wrappers are inserted
            self.children = tuple(elem.iterchildren())
            self.child_pos = 0

        def add_simple_child(self, elem):
            # Record the text of a childless inline element
            if text := elem.text:
                self.texts.append(Chunk(elem, text, self.pos))
                self.pos += len(text)

        def add_tail(self, elem, text):
            # Record the text that follows the child elem
            self.texts.append(Chunk(elem, text, self.pos, is_tail=True))
            self.pos += len(text)

        def commit(self) -> None:
            # Break the text gathered so far into sentences, wrap each one
            # and then start gathering afresh
            if not self.texts:
                return
            text = ''.join(c.text for c in self.texts)
            self.pos = 0
            for start, length in sentence_positions(text, self.lang):
                elem_id = self.wrap_sentence(start, length)
                ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
            self.texts = []
            self.pos = 0

        def make_into_wrapper(self, elem: Element) -> str:
            # Give elem the next cttsw-N id that is not in use and return it
            nonlocal id_counter
            while True:
                q = f'cttsw-{id_counter}'
                if q not in seen_ids:
                    elem.set('id', q)
                    seen_ids.add(q)
                    return q
                id_counter += 1

        def make_wrapper(self, text: str | None) -> Element:
            # Create a span in the same namespace as self.elem, holding text
            # and already carrying a fresh id. It is not inserted anywhere.
            ns, sep, _ = self.elem.tag.partition('}')
            ans = self.elem.makeelement(ns + sep + 'span')
            ans.text = text
            self.make_into_wrapper(ans)
            return ans

        def replace_reference_to_child(self, elem: Element, replacement: Element) -> None:
            # Re-point the chunks immediately after the current one from elem
            # to replacement, stopping at the first chunk owned by something else
            for i in range(self.pos + 1, len(self.texts)):
                if self.texts[i].child is elem:
                    self.texts[i] = self.texts[i]._replace(child=replacement)
                else:
                    break

        def wrap_contents(self, first_child: Element | None, last_child: Element) -> Element:
            # Insert a new wrapper where first_child is and move children into
            # it. When first_child is None the wrapper starts at the first
            # child element and also takes over self.elem.text.
            w = self.make_wrapper(self.elem.text if first_child is None else None)
            in_range = False
            for c in self.elem.iterchildren('*'):
                if not in_range and (first_child is None or first_child is c):
                    in_range = True
                    pos = self.elem.index(c)
                    self.elem.insert(pos, w)
                    w.append(c)
                    first_child = c
                if in_range:
                    if last_child is not first_child:
                        w.append(last_child)
                    if c is last_child:
                        break
            self.replace_reference_to_child(last_child, w)
            return w

        def clone_simple_element(self, elem: Element) -> Element:
            # Insert a copy of elem right after it, with the same tag, text,
            # tail and attributes except id and name
            ans = elem.makeelement(elem.tag)
            ans.attrib.update(elem.attrib)
            ans.attrib.pop('id', None)
            ans.attrib.pop('name', None)
            ans.text, ans.tail = elem.text, elem.tail
            p = elem.getparent()
            idx = p.index(elem)
            p.insert(idx + 1, ans)
            self.replace_reference_to_child(elem, ans)
            clones_map[elem].append(ans)
            return ans

        def wrap_sentence(self, start: int, length: int) -> str:
            # Wrap the sentence covering [start, start + length) of the
            # concatenated text and return the id of the wrapping element.
            # Text left over in the last chunk touched stays queued as a new
            # chunk for the next sentence.
            end = start + length
            start_chunk = end_chunk = -1
            start_offset = end_offset = 0
            # Find the chunks in which the sentence starts and ends
            for i in range(self.pos, len(self.texts)):
                c = self.texts[i]
                if c.start_at <= start:
                    start_chunk = i
                    start_offset = start - c.start_at
                if end <= c.start_at + len(c.text):
                    end_chunk = i
                    self.pos = i
                    end_offset = end - c.start_at
                    break
            else:
                # No chunk contains end, so run to the end of the last chunk
                self.pos = end_chunk = len(self.texts) - 1
                end_offset = len(self.texts[-1].text)
            assert start_chunk > -1
            s, e = self.texts[start_chunk], self.texts[end_chunk]
            if s.child is None:  # start in leading text of parent element
                if e is s:  # end also in leading text of parent element
                    before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
                    self.elem.text = before
                    w = self.make_wrapper(sentence)
                    self.elem.insert(0, w)
                    w.tail = after
                    if after:
                        self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                if e.is_tail:  # ending in the tail of a child
                    before_start, after_start = s.text[:start_offset], s.text[start_offset:]
                    included, after = e.text[:end_offset], e.text[end_offset:]
                    e.child.tail = included
                    # wrap_contents() copies self.elem.text into the wrapper,
                    # so set it to the sentence part first and restore after
                    self.elem.text = after_start
                    w = self.wrap_contents(None, e.child)
                    w.tail = after
                    self.elem.text = before_start
                    if after:
                        self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                # ending inside a child
                before_start, after_start = s.text[:start_offset], s.text[start_offset:]
                included, after = e.text[:end_offset], e.text[end_offset:]
                e.child.text = included
                c = self.clone_simple_element(e.child)
                c.text = after
                e.child.tail = None
                self.elem.text = after_start
                w = self.wrap_contents(None, e.child)
                self.elem.text = before_start
                if after:
                    self.texts[self.pos] = Chunk(c, c.text, end)
                else:
                    self.pos += 1
                return w.get('id')
            # starting in a child text or tail
            if s.is_tail:
                if e.is_tail:
                    if s is e:  # end in tail of same element
                        before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
                        s.child.tail = before
                        w = self.make_wrapper(sentence)
                        w.tail = after
                        idx = self.elem.index(s.child)
                        self.elem.insert(idx + 1, w)
                        if after:
                            self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
                        else:
                            self.pos += 1
                        return w.get('id')
                    s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
                    e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
                    idx = self.elem.index(s.child)
                    w = self.wrap_contents(self.elem[idx+1], e.child)
                    w.text, w.tail = after_start, after_end
                    if after_end:
                        self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                # end inside some subsequent simple element
                s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
                e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
                c = self.clone_simple_element(e.child)
                c.text = after_end
                e.child.tail = None
                w = self.wrap_contents(self.elem[self.elem.index(s.child) + 1], e.child)
                w.text = after_start
                if after_end:
                    self.texts[self.pos] = Chunk(c, after_end, end)
                else:
                    self.pos += 1
                return w.get('id')
            # start is in the text of a simple child
            if s.child is e.child:
                if e.is_tail:  # ending in tail of element we start in
                    before_start, after_start = s.text[:start_offset], s.text[start_offset:]
                    c = self.clone_simple_element(s.child)
                    s.child.text, s.child.tail = before_start, None
                    before_end, after_end = e.text[:end_offset], e.text[end_offset:]
                    c.text, c.tail = after_start, before_end
                    w = self.wrap_contents(c, c)
                    w.tail = after_end
                    if after_end:
                        self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                # start and end in text of element
                before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
                c = self.clone_simple_element(s.child)
                s.child.text, s.child.tail = before, None
                c.text, c.tail = sentence, None
                c2 = self.clone_simple_element(c)
                c2.text = after
                # No extra span is needed here: the clone holding exactly the
                # sentence receives the id itself
                self.make_into_wrapper(c)
                if after:
                    self.texts[self.pos] = Chunk(c2, after, end)
                else:
                    self.pos += 1
                return c.get('id')
            # end is in a subsequent simple child or tail of one
            s.child.text, after_start = s.text[:start_offset], s.text[start_offset:]
            c = self.clone_simple_element(s.child)
            c.text, s.child.tail = after_start, None
            if e.is_tail:
                e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
                w = self.wrap_contents(c, e.child)
                w.tail = after_end
                if after_end:
                    self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
                else:
                    self.pos += 1
                return w.get('id')
            # end is in text of subsequent simple child
            e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
            c2 = self.clone_simple_element(e.child)
            c2.text, e.child.tail = after_end, None
            w = self.wrap_contents(c, e.child)
            if after_end:
                self.texts[self.pos] = Chunk(c2, after_end, end)
            else:
                self.pos += 1
            return w.get('id')

    # Depth-first walk over the tree using an explicit stack, starting from
    # the <body> children of root
    stack_of_parents = [Parent(elem, 'body', root_lang, root_voice) for elem in root.iterchildren('*') if barename(elem.tag).lower() == 'body']
    while stack_of_parents:
        p = stack_of_parents.pop()
        if len(p.elem) == 1 and not has_text(p.elem):  # wrapper
            # An element holding a single child and no text of its own is
            # skipped; its child is processed in its place
            c = p.elem[0]
            if isinstance(c.tag, str):
                stack_of_parents.append(Parent(c, barename(c.tag).lower(), p.lang, p.voice))
            continue
        for i in range(p.child_pos, len(p.children)):
            child = p.children[i]
            child_voice = child.get('data-calibre-tts', '')
            child_lang = lang_for_elem(child, p.lang)
            # Comments and processing instructions have a non-string tag
            child_tag_name = barename(child.tag).lower() if isinstance(child.tag, str) else ''
            if child_lang == p.lang and child_voice == p.voice and child_tag_name in continued_tag_names and len(child) == 0:
                p.add_simple_child(child)
            elif child_tag_name not in ignored_tag_names:
                # The child gets a Parent of its own, and the text gathered
                # before it is committed so no sentence spans it.
                # NOTE(review): this continue resumes the for loop, so the
                # tail of this child is not added below, and p is pushed back
                # on the stack while this loop keeps running - confirm the
                # nesting and intent against the repository.
                stack_of_parents.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
                p.commit()
                p.child_pos = i + 1
                stack_of_parents.append(p)
                continue
            if text := child.tail:
                p.add_tail(child, text)
        p.commit()
    # Remove clones, and the elements they were cloned from, that were left
    # with no text, no tail, no id and no name
    for src_elem, clones in clones_map.items():
        for clone in clones + [src_elem]:
            if not clone.text and not clone.tail and not clone.get('id') and not clone.get('name'):
                if (p := clone.getparent()) is not None:
                    p.remove(clone)
    return ans
|
@ -4,10 +4,7 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
from collections import defaultdict
|
|
||||||
from contextlib import suppress
|
|
||||||
from threading import Lock
|
from threading import Lock
|
||||||
from typing import NamedTuple
|
|
||||||
|
|
||||||
from calibre.utils.icu import _icu
|
from calibre.utils.icu import _icu
|
||||||
from calibre.utils.localization import lang_as_iso639_1
|
from calibre.utils.localization import lang_as_iso639_1
|
||||||
@ -128,331 +125,3 @@ def split_into_sentences_for_tts(
|
|||||||
yield start, sentence
|
yield start, sentence
|
||||||
if pending_sentence:
|
if pending_sentence:
|
||||||
yield pending_start, pending_sentence
|
yield pending_start, pending_sentence
|
||||||
|
|
||||||
|
|
||||||
class Sentence(NamedTuple):
    '''
    Record describing one sentence that mark_sentences_in_html() has wrapped.
    '''
    # value of the id attribute set on the element wrapping the sentence
    elem_id: str
    # the text of the sentence itself
    text: str
    # language of the element the sentence was found in
    lang: str
    # voice in effect for the element the sentence was found in
    voice: str
|
|
||||||
|
|
||||||
|
|
||||||
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
    '''
    Split the text under the <body> children of root into sentences and wrap
    each sentence in an element carrying a unique id of the form cttsw-N. The
    tree is modified in place.

    :param root: Root element of a parsed HTML tree. It is used through lxml
        element methods such as iterchildren(), xpath() and makeelement().
    :param lang: Language to assume when root declares none. When empty,
        get_lang() is consulted, and 'en' is the last resort.
    :param voice: Voice to record for sentences, unless an element overrides
        it through its data-calibre-tts attribute.
    :return: One Sentence for every wrapped sentence, in the order wrapped.
    '''
    import json

    from lxml.etree import ElementBase as Element
    from lxml.etree import tostring as _tostring

    from calibre.ebooks.oeb.base import barename
    from calibre.utils.localization import canonicalize_lang, get_lang
    # Childless inline tags whose text is merged into the parent's running
    # text, provided their language and voice match the parent's
    continued_tag_names = frozenset({
        'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
    })
    # Tags that are never descended into when looking for text
    ignored_tag_names = frozenset({
        'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
    })

    def tostring(x) -> str:
        # Serialisation helper. Nothing in this function calls it.
        return _tostring(x, encoding='unicode')

    def lang_for_elem(elem, parent_lang):
        # Language declared on elem itself, else the inherited one.
        # NOTE(review): 'xml_lang' is presumably how the parser spells a
        # sanitised xml:lang attribute - confirm.
        return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang

    def has_text(elem):
        # True if elem has non-whitespace text directly inside it, either as
        # its leading text or as the tail of one of its children
        if elem.text and elem.text.strip():
            return True
        for child in elem:
            if child.tail and child.tail.strip():
                return True
        return False

    root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
    root_voice = voice
    # Every id already present in the document, so generated ids never clash
    seen_ids = set(root.xpath('//*/@id'))
    id_counter = 1
    ans = []
    # source element -> clones made of it while splitting; consulted at the
    # end to drop the ones that ended up empty
    clones_map = defaultdict(list)

    class Chunk(NamedTuple):
        # A run of text that contributes to a Parent's concatenated text
        child: Element | None  # element owning the text, None for the parent's own leading text
        text: str
        start_at: int  # offset of this run within the concatenated text
        is_tail: bool = False  # True when the text is child.tail rather than child.text

    class Parent:
        # Gathers the text found directly inside one element, including the
        # text of its simple inline children, and wraps the sentences in it.

        def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):
            self.elem = elem
            self.tag_name = tag_name
            self.lang = child_lang or lang_for_elem(elem, parent_lang)
            q = elem.get('data-calibre-tts', '')
            self.voice = parent_voice
            # The attribute is either a JSON object with a 'voice' key or a
            # bare voice name
            if q.startswith('{'):  # }
                # Malformed JSON is ignored, leaving the inherited voice
                with suppress(Exception):
                    q = json.loads(q)
                    self.voice = q.get('voice') or parent_voice
            else:
                self.voice = q or parent_voice
            # While text is being gathered, pos is the running character
            # offset. During commit() it is reused as an index into texts.
            self.pos = 0
            self.texts = []
            if elem.text and elem.text.strip():
                self.texts.append(Chunk(None, elem.text, self.pos))
                self.pos += len(elem.text)
            # Snapshot of the children, taken before any wrappers are inserted
            self.children = tuple(elem.iterchildren())
            self.child_pos = 0

        def add_simple_child(self, elem):
            # Record the text of a childless inline element
            if text := elem.text:
                self.texts.append(Chunk(elem, text, self.pos))
                self.pos += len(text)

        def add_tail(self, elem, text):
            # Record the text that follows the child elem
            self.texts.append(Chunk(elem, text, self.pos, is_tail=True))
            self.pos += len(text)

        def commit(self) -> None:
            # Break the text gathered so far into sentences, wrap each one
            # and then start gathering afresh
            if not self.texts:
                return
            text = ''.join(c.text for c in self.texts)
            self.pos = 0
            for start, length in sentence_positions(text, self.lang):
                elem_id = self.wrap_sentence(start, length)
                ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
            self.texts = []
            self.pos = 0

        def make_into_wrapper(self, elem: Element) -> str:
            # Give elem the next cttsw-N id that is not in use and return it
            nonlocal id_counter
            while True:
                q = f'cttsw-{id_counter}'
                if q not in seen_ids:
                    elem.set('id', q)
                    seen_ids.add(q)
                    return q
                id_counter += 1

        def make_wrapper(self, text: str | None) -> Element:
            # Create a span in the same namespace as self.elem, holding text
            # and already carrying a fresh id. It is not inserted anywhere.
            ns, sep, _ = self.elem.tag.partition('}')
            ans = self.elem.makeelement(ns + sep + 'span')
            ans.text = text
            self.make_into_wrapper(ans)
            return ans

        def replace_reference_to_child(self, elem: Element, replacement: Element) -> None:
            # Re-point the chunks immediately after the current one from elem
            # to replacement, stopping at the first chunk owned by something else
            for i in range(self.pos + 1, len(self.texts)):
                if self.texts[i].child is elem:
                    self.texts[i] = self.texts[i]._replace(child=replacement)
                else:
                    break

        def wrap_contents(self, first_child: Element | None, last_child: Element) -> Element:
            # Insert a new wrapper where first_child is and move children into
            # it. When first_child is None the wrapper starts at the first
            # child element and also takes over self.elem.text.
            w = self.make_wrapper(self.elem.text if first_child is None else None)
            in_range = False
            for c in self.elem.iterchildren('*'):
                if not in_range and (first_child is None or first_child is c):
                    in_range = True
                    pos = self.elem.index(c)
                    self.elem.insert(pos, w)
                    w.append(c)
                    first_child = c
                if in_range:
                    if last_child is not first_child:
                        w.append(last_child)
                    if c is last_child:
                        break
            self.replace_reference_to_child(last_child, w)
            return w

        def clone_simple_element(self, elem: Element) -> Element:
            # Insert a copy of elem right after it, with the same tag, text,
            # tail and attributes except id and name
            ans = elem.makeelement(elem.tag)
            ans.attrib.update(elem.attrib)
            ans.attrib.pop('id', None)
            ans.attrib.pop('name', None)
            ans.text, ans.tail = elem.text, elem.tail
            p = elem.getparent()
            idx = p.index(elem)
            p.insert(idx + 1, ans)
            self.replace_reference_to_child(elem, ans)
            clones_map[elem].append(ans)
            return ans

        def wrap_sentence(self, start: int, length: int) -> str:
            # Wrap the sentence covering [start, start + length) of the
            # concatenated text and return the id of the wrapping element.
            # Text left over in the last chunk touched stays queued as a new
            # chunk for the next sentence.
            end = start + length
            start_chunk = end_chunk = -1
            start_offset = end_offset = 0
            # Find the chunks in which the sentence starts and ends
            for i in range(self.pos, len(self.texts)):
                c = self.texts[i]
                if c.start_at <= start:
                    start_chunk = i
                    start_offset = start - c.start_at
                if end <= c.start_at + len(c.text):
                    end_chunk = i
                    self.pos = i
                    end_offset = end - c.start_at
                    break
            else:
                # No chunk contains end, so run to the end of the last chunk
                self.pos = end_chunk = len(self.texts) - 1
                end_offset = len(self.texts[-1].text)
            assert start_chunk > -1
            s, e = self.texts[start_chunk], self.texts[end_chunk]
            if s.child is None:  # start in leading text of parent element
                if e is s:  # end also in leading text of parent element
                    before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
                    self.elem.text = before
                    w = self.make_wrapper(sentence)
                    self.elem.insert(0, w)
                    w.tail = after
                    if after:
                        self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                if e.is_tail:  # ending in the tail of a child
                    before_start, after_start = s.text[:start_offset], s.text[start_offset:]
                    included, after = e.text[:end_offset], e.text[end_offset:]
                    e.child.tail = included
                    # wrap_contents() copies self.elem.text into the wrapper,
                    # so set it to the sentence part first and restore after
                    self.elem.text = after_start
                    w = self.wrap_contents(None, e.child)
                    w.tail = after
                    self.elem.text = before_start
                    if after:
                        self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                # ending inside a child
                before_start, after_start = s.text[:start_offset], s.text[start_offset:]
                included, after = e.text[:end_offset], e.text[end_offset:]
                e.child.text = included
                c = self.clone_simple_element(e.child)
                c.text = after
                e.child.tail = None
                self.elem.text = after_start
                w = self.wrap_contents(None, e.child)
                self.elem.text = before_start
                if after:
                    self.texts[self.pos] = Chunk(c, c.text, end)
                else:
                    self.pos += 1
                return w.get('id')
            # starting in a child text or tail
            if s.is_tail:
                if e.is_tail:
                    if s is e:  # end in tail of same element
                        before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
                        s.child.tail = before
                        w = self.make_wrapper(sentence)
                        w.tail = after
                        idx = self.elem.index(s.child)
                        self.elem.insert(idx + 1, w)
                        if after:
                            self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
                        else:
                            self.pos += 1
                        return w.get('id')
                    s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
                    e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
                    idx = self.elem.index(s.child)
                    w = self.wrap_contents(self.elem[idx+1], e.child)
                    w.text, w.tail = after_start, after_end
                    if after_end:
                        self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                # end inside some subsequent simple element
                s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
                e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
                c = self.clone_simple_element(e.child)
                c.text = after_end
                e.child.tail = None
                w = self.wrap_contents(self.elem[self.elem.index(s.child) + 1], e.child)
                w.text = after_start
                if after_end:
                    self.texts[self.pos] = Chunk(c, after_end, end)
                else:
                    self.pos += 1
                return w.get('id')
            # start is in the text of a simple child
            if s.child is e.child:
                if e.is_tail:  # ending in tail of element we start in
                    before_start, after_start = s.text[:start_offset], s.text[start_offset:]
                    c = self.clone_simple_element(s.child)
                    s.child.text, s.child.tail = before_start, None
                    before_end, after_end = e.text[:end_offset], e.text[end_offset:]
                    c.text, c.tail = after_start, before_end
                    w = self.wrap_contents(c, c)
                    w.tail = after_end
                    if after_end:
                        self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
                    else:
                        self.pos += 1
                    return w.get('id')
                # start and end in text of element
                before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
                c = self.clone_simple_element(s.child)
                s.child.text, s.child.tail = before, None
                c.text, c.tail = sentence, None
                c2 = self.clone_simple_element(c)
                c2.text = after
                # No extra span is needed here: the clone holding exactly the
                # sentence receives the id itself
                self.make_into_wrapper(c)
                if after:
                    self.texts[self.pos] = Chunk(c2, after, end)
                else:
                    self.pos += 1
                return c.get('id')
            # end is in a subsequent simple child or tail of one
            s.child.text, after_start = s.text[:start_offset], s.text[start_offset:]
            c = self.clone_simple_element(s.child)
            c.text, s.child.tail = after_start, None
            if e.is_tail:
                e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
                w = self.wrap_contents(c, e.child)
                w.tail = after_end
                if after_end:
                    self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
                else:
                    self.pos += 1
                return w.get('id')
            # end is in text of subsequent simple child
            e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
            c2 = self.clone_simple_element(e.child)
            c2.text, e.child.tail = after_end, None
            w = self.wrap_contents(c, e.child)
            if after_end:
                self.texts[self.pos] = Chunk(c2, after_end, end)
            else:
                self.pos += 1
            return w.get('id')

    # Depth-first walk over the tree using an explicit stack, starting from
    # the <body> children of root
    stack_of_parents = [Parent(elem, 'body', root_lang, root_voice) for elem in root.iterchildren('*') if barename(elem.tag).lower() == 'body']
    while stack_of_parents:
        p = stack_of_parents.pop()
        if len(p.elem) == 1 and not has_text(p.elem):  # wrapper
            # An element holding a single child and no text of its own is
            # skipped; its child is processed in its place
            c = p.elem[0]
            if isinstance(c.tag, str):
                stack_of_parents.append(Parent(c, barename(c.tag).lower(), p.lang, p.voice))
            continue
        for i in range(p.child_pos, len(p.children)):
            child = p.children[i]
            child_voice = child.get('data-calibre-tts', '')
            child_lang = lang_for_elem(child, p.lang)
            # Comments and processing instructions have a non-string tag
            child_tag_name = barename(child.tag).lower() if isinstance(child.tag, str) else ''
            if child_lang == p.lang and child_voice == p.voice and child_tag_name in continued_tag_names and len(child) == 0:
                p.add_simple_child(child)
            elif child_tag_name not in ignored_tag_names:
                # The child gets a Parent of its own, and the text gathered
                # before it is committed so no sentence spans it.
                # NOTE(review): this continue resumes the for loop, so the
                # tail of this child is not added below, and p is pushed back
                # on the stack while this loop keeps running - confirm the
                # nesting and intent against the repository.
                stack_of_parents.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
                p.commit()
                p.child_pos = i + 1
                stack_of_parents.append(p)
                continue
            if text := child.tail:
                p.add_tail(child, text)
        p.commit()
    # Remove clones, and the elements they were cloned from, that were left
    # with no text, no tail, no id and no name
    for src_elem, clones in clones_map.items():
        for clone in clones + [src_elem]:
            if not clone.text and not clone.tail and not clone.get('id') and not clone.get('name'):
                if (p := clone.getparent()) is not None:
                    p.remove(clone)
    return ans
|
|
||||||
|
@ -267,57 +267,6 @@ class TestICU(unittest.TestCase):
|
|||||||
}.items():
|
}.items():
|
||||||
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
|
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
|
||||||
|
|
||||||
def test_mark_sentences(self):
    # Checks the markup produced by sentence marking against a table of
    # HTML snippets, then checks the per-sentence language that is reported.
    from html5_parser import parse
    from lxml import html

    from calibre.spell.break_iterator import mark_sentences_in_html
    # Each entry is (input markup, expected serialisation of <body> once the
    # 'cttsw-' prefix has been stripped from the generated ids).
    cases = (
        ('<p id=1>hello cruel world', '<body><p id="1"><span id="1">hello cruel world</span></p>'),

        ('<p>hello <b>cruel</b> world', '<body><p><span id="1">hello <b>cruel</b> world</span></p>'),

        ('<p>Yes, please. Hello <b>cruel</b> world.',
         '<body><p><span id="1">Yes, please. </span><span id="2">Hello <b>cruel</b> world.</span></p>'),

        ('<p>Hello <b>cruel</b> <i>world. </i>',
         '<body><p><span id="1">Hello <b>cruel</b> <i>world. </i></span></p>'),

        ('<p>Yes, <b>please.</b> Well done! Bravissima! ',
         '<body><p><span id="1">Yes, <b>please.</b> </span><span id="2">Well done! </span><span id="3">Bravissima! </span></p>'),

        ('<p>Yes, <b>please.</b> Well <i>done! </i>Bravissima! ',
         '<body><p><span id="1">Yes, <b>please.</b> </span><span id="2">Well <i>done! </i></span><span id="3">Bravissima! </span></p>'),

        ('<p><i>Hello</i>, world! Good day to you',
         '<body><p><span id="1"><i>Hello</i>, world! </span><span id="2">Good day to you</span></p>'),

        ('<p><i>Hello, world! </i>Good day to you',
         '<body><p><i id="1">Hello, world! </i><span id="2">Good day to you</span></p>'),

        ('<p><i>Hello, </i><b>world!</b>Good day to you',
         '<body><p><span id="1"><i>Hello, </i><b>world!</b></span><span id="2">Good day to you</span></p>'),

        ('<p><i>Hello, </i><b>world</b>! Good day to you',
         '<body><p><span id="1"><i>Hello, </i><b>world</b>! </span><span id="2">Good day to you</span></p>'),

        ('<p>Hello, <span lang="fr">world!',
         '<body><p><span id="1">Hello, </span><span lang="fr"><span id="2">world!</span></span></p>'),

        ('<p>Hello, <span data-calibre-tts="moose">world!',
         '<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>'),
    )
    # The table is walked last-to-first
    for markup, wanted in reversed(cases):
        tree = parse(markup, namespace_elements=True)
        mark_sentences_in_html(tree)
        serialized = html.tostring(tree, encoding='unicode')
        # Keep only what lies between the opening <body and the final </body>
        body_start = serialized.find('<body')
        serialized = serialized[body_start:]
        body_end = serialized.rfind('</body>')
        got = serialized[:body_end].replace('cttsw-', '')
        self.ae(wanted, got)
    marked = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
    langs = tuple(sentence.lang for sentence in marked)
    self.ae(langs, ('eng', 'fra'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def find_tests():
    # Return a suite holding all the tests defined on TestICU
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(TestICU)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user