diff --git a/src/calibre/ebooks/oeb/polish/tests/structure.py b/src/calibre/ebooks/oeb/polish/tests/structure.py
index 4b072d28d2..491abe781b 100644
--- a/src/calibre/ebooks/oeb/polish/tests/structure.py
+++ b/src/calibre/ebooks/oeb/polish/tests/structure.py
@@ -193,6 +193,58 @@ class Structure(BaseTest):
self.assertEqual('a.html', find_cover_page(c))
self.assertEqual('a.html', next(c.spine_names)[0])
+ def test_mark_sentences(self):  # checks the serialized tree left behind by mark_sentences_in_html() and the languages of the sentences it returns
+ from html5_parser import parse
+ from lxml import html
+
+ from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html
+ for text, expected in reversed({  # NOTE(review): the literals below appear to have lost their markup in this view and several keys read as identical - confirm against upstream
+ '
hello cruel world': '
hello cruel world
',
+
+ 'hello cruel world': '
hello cruel world
',
+
+ 'Yes, please. Hello cruel world.':
+ '
Yes, please. Hello cruel world.
',
+
+ 'Hello cruel world. ':
+ '
Hello cruel world.
',
+
+ 'Yes, please. Well done! Bravissima! ':
+ '
Yes, please. Well done! Bravissima!
',
+
+ 'Yes, please. Well done! Bravissima! ':
+ '
Yes, please. Well done! Bravissima!
',
+
+ 'Hello, world! Good day to you':
+ '
Hello, world! Good day to you
',
+
+ 'Hello, world! Good day to you':
+ '
Hello, world! Good day to you
',
+
+ 'Hello, world!Good day to you':
+ '
Hello, world!Good day to you
',
+
+ 'Hello, world! Good day to you':
+ '
Hello, world! Good day to you
',
+
+ 'Hello, world!':
+ 'Hello, world!
',
+
+ 'Hello, world!':
+ 'Hello, world!
',
+ }.items()):
+ root = parse(text, namespace_elements=True)
+ mark_sentences_in_html(root)  # return value unused here; root is serialized and compared right after
+ actual = html.tostring(root, encoding='unicode')
+ actual = actual[actual.find('')]  # NOTE(review): as written find('') is 0, so this keeps a single character; the search string and slice look garbled - confirm against upstream
+ actual = actual.replace('cttsw-', '')  # drop the 'cttsw-' prefix before comparing
+ self.assertEqual(expected, actual)
+ sentences = mark_sentences_in_html(parse('Hello, world!'))
+ self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra'))  # expects exactly two sentences, reported as 'eng' then 'fra'
+
+
+
def find_tests():
import unittest
diff --git a/src/calibre/ebooks/oeb/polish/tts.py b/src/calibre/ebooks/oeb/polish/tts.py
new file mode 100644
index 0000000000..8571e6e6d1
--- /dev/null
+++ b/src/calibre/ebooks/oeb/polish/tts.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal
+
+from collections import defaultdict
+from contextlib import suppress
+from typing import NamedTuple
+
+from calibre.spell.break_iterator import sentence_positions
+
+
+class Sentence(NamedTuple):  # one record per sentence, as returned by mark_sentences_in_html()
+ elem_id: str  # id of the element that wraps the sentence
+ text: str  # the sentence text, sliced from the concatenated text it was found in
+ lang: str  # language of the Parent the sentence was found in
+ voice : str  # voice of the Parent the sentence was found in
+
+
+
+def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:  # wraps each sentence under root's body children in an element with a generated id (root is modified in place) and returns the Sentence records; lang is the fallback language, voice the initial voice
+ import json
+
+ from lxml.etree import ElementBase as Element
+ from lxml.etree import tostring as _tostring
+
+ from calibre.ebooks.oeb.base import barename
+ from calibre.utils.localization import canonicalize_lang, get_lang
+ continued_tag_names = frozenset({  # a childless child with one of these tags, and the same lang/voice as its parent, has its text merged into the parent's text
+ 'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
+ })
+ ignored_tag_names = frozenset({  # no Parent is created for children with these tags
+ 'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
+ })
+
+ def tostring(x) -> str:  # serializes x to a str via lxml; not referenced elsewhere in this function as shown
+ return _tostring(x, encoding='unicode')
+
+ def lang_for_elem(elem, parent_lang):  # canonicalized value of elem's lang, xml_lang or namespaced xml lang attribute (first non-empty wins), else parent_lang
+ return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
+
+ def has_text(elem):  # True when elem.text or the tail of any direct child contains non-whitespace
+ if elem.text and elem.text.strip():
+ return True
+ for child in elem:
+ if child.tail and child.tail.strip():
+ return True
+ return False
+
+ root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')  # root's own language attribute, else lang, else get_lang(), with 'en' as the last resort
+ root_voice = voice
+ seen_ids = set(root.xpath('//*/@id'))  # ids already present, so generated ids never collide with them
+ id_counter = 1  # numeric suffix tried for the next generated 'cttsw-N' id
+ ans = []  # Sentence records, returned at the end
+ clones_map = defaultdict(list)  # source element -> clones made from it by clone_simple_element()
+
+ class Chunk(NamedTuple):  # one piece of the text collected by a Parent
+ child: Element | None  # None for the parent's own leading text, otherwise the child element the text belongs to
+ text: str
+ start_at: int  # offset of this piece within the Parent's concatenated text
+ is_tail: bool = False  # True when text is the child's tail rather than its text
+
+
+ class Parent:  # an element whose text is collected into chunks and split into sentences, plus the bookkeeping for wrapping them
+
+ def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):
+ self.elem = elem
+ self.tag_name = tag_name
+ self.lang = child_lang or lang_for_elem(elem, parent_lang)  # an explicit child_lang wins over the element's own attribute
+ q = elem.get('data-calibre-tts', '')  # either a plain voice name or a JSON object with a 'voice' key
+ self.voice = parent_voice
+ if q.startswith('{'): # }
+ with suppress(Exception):  # any failure while decoding leaves self.voice as parent_voice
+ q = json.loads(q)
+ self.voice = q.get('voice') or parent_voice
+ else:
+ self.voice = q or parent_voice
+ self.pos = 0  # text offset while chunks are being added; reused as an index into self.texts once commit() runs
+ self.texts = []
+ if elem.text and elem.text.strip():  # whitespace-only leading text is not recorded
+ self.texts.append(Chunk(None, elem.text, self.pos))
+ self.pos += len(elem.text)
+ self.children = tuple(elem.iterchildren())  # snapshot of the children taken at construction
+ self.child_pos = 0  # index into self.children at which the traversal resumes
+
+ def add_simple_child(self, elem):  # records elem.text, when non-empty, as a chunk owned by elem and advances the text offset
+ if text := elem.text:
+ self.texts.append(Chunk(elem, text, self.pos))
+ self.pos += len(text)
+
+ def add_tail(self, elem, text):  # records text as the tail chunk of elem and advances the text offset
+ self.texts.append(Chunk(elem, text, self.pos, is_tail=True))
+ self.pos += len(text)
+
+ def commit(self) -> None:  # splits the collected text into sentences, wraps each one, appends a Sentence per sentence to ans, then clears the chunks
+ if not self.texts:
+ return
+ text = ''.join(c.text for c in self.texts)
+ self.pos = 0  # from here on self.pos is an index into self.texts
+ for start, length in sentence_positions(text, self.lang):  # each item is unpacked as (start offset, length) into text
+ elem_id = self.wrap_sentence(start, length)
+ ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
+ self.texts = []
+ self.pos = 0
+
+ def make_into_wrapper(self, elem: Element) -> str:  # sets elem's id to the first 'cttsw-N' not in seen_ids, records it, and returns it
+ nonlocal id_counter
+ while True:
+ q = f'cttsw-{id_counter}'
+ if q not in seen_ids:
+ elem.set('id', q)
+ seen_ids.add(q)
+ return q
+ id_counter += 1
+
+ def make_wrapper(self, text: str | None) -> Element:  # new span in the same namespace as self.elem, holding text and carrying a freshly generated id; not yet attached to the tree
+ ns, sep, _ = self.elem.tag.partition('}')  # '{ns}tag' splits into '{ns' and '}'; for a tag without '}' sep is empty and ns is the whole tag
+ ans = self.elem.makeelement(ns + sep + 'span')
+ ans.text = text
+ self.make_into_wrapper(ans)
+ return ans
+
+ def replace_reference_to_child(self, elem: Element, replacement: Element) -> None:  # repoints the run of chunks directly after self.pos that refer to elem so they refer to replacement
+ for i in range(self.pos + 1, len(self.texts)):
+ if self.texts[i].child is elem:
+ self.texts[i] = self.texts[i]._replace(child=replacement)
+ else:  # stop at the first later chunk that belongs to a different element
+ break
+
+ def wrap_contents(self, first_child: Element | None, last_child: Element) -> Element:  # inserts a new wrapper at the position of first_child (or of the first child when first_child is None), moves first_child and last_child into it, and returns it
+ w = self.make_wrapper(self.elem.text if first_child is None else None)  # the wrapper takes a copy of self.elem.text only when wrapping from the very start
+ in_range = False
+ for c in self.elem.iterchildren('*'):
+ if not in_range and (first_child is None or first_child is c):
+ in_range = True
+ pos = self.elem.index(c)
+ self.elem.insert(pos, w)
+ w.append(c)
+ first_child = c
+ if in_range:
+ if last_child is not first_child:
+ w.append(last_child)  # NOTE(review): only first_child and last_child are appended by name; children strictly between them do not appear to be moved - confirm intended
+ if c is last_child:
+ break
+ self.replace_reference_to_child(last_child, w)  # later chunks that pointed at last_child now point at the wrapper
+ return w
+
+ def clone_simple_element(self, elem: Element) -> Element:  # inserts a copy of elem (same tag, attributes, text and tail, minus id and name) right after elem and returns the copy
+ ans = elem.makeelement(elem.tag)
+ ans.attrib.update(elem.attrib)
+ ans.attrib.pop('id', None)  # id and name are not carried over to the copy
+ ans.attrib.pop('name', None)
+ ans.text, ans.tail = elem.text, elem.tail
+ p = elem.getparent()
+ idx = p.index(elem)
+ p.insert(idx + 1, ans)
+ self.replace_reference_to_child(elem, ans)
+ clones_map[elem].append(ans)  # remembered so that empty originals and clones can be removed at the end
+ return ans
+
+ def wrap_sentence(self, start: int, length: int) -> str:  # wraps the range [start, start + length) of the concatenated chunk text in an element and returns that element's id
+ end = start + length
+ start_chunk = end_chunk = -1
+ start_offset = end_offset = 0
+ for i in range(self.pos, len(self.texts)):  # scan from the current chunk for the chunk where the sentence starts and the chunk where it ends
+ c = self.texts[i]
+ if c.start_at <= start:
+ start_chunk = i
+ start_offset = start - c.start_at
+ if end <= c.start_at + len(c.text):
+ end_chunk = i
+ self.pos = i
+ end_offset = end - c.start_at
+ break
+ else:  # presumably the for loop's else (indentation is not visible in this view): no chunk contained end, so use the end of the last chunk
+ self.pos = end_chunk = len(self.texts) - 1
+ end_offset = len(self.texts[-1].text)
+ assert start_chunk > -1
+ s, e = self.texts[start_chunk], self.texts[end_chunk]  # s: chunk the sentence starts in, e: chunk it ends in
+ if s.child is None: # start in leading text of parent element
+ if e is s: # end also in leading text of parent element
+ before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
+ self.elem.text = before
+ w = self.make_wrapper(sentence)
+ self.elem.insert(0, w)
+ w.tail = after
+ if after:  # leftover text now lives in the wrapper's tail; re-register it as the current chunk for the next sentence
+ self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
+ else:  # chunk fully consumed, move on to the next one
+ self.pos += 1
+ return w.get('id')
+ if e.is_tail: # ending in the tail of a child
+ before_start, after_start = s.text[:start_offset], s.text[start_offset:]
+ included, after = e.text[:end_offset], e.text[end_offset:]
+ e.child.tail = included
+ self.elem.text = after_start  # set temporarily so that wrap_contents() copies only the sentence's part into the wrapper
+ w = self.wrap_contents(None, e.child)
+ w.tail = after
+ self.elem.text = before_start
+ if after:
+ self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
+ else:
+ self.pos += 1
+ return w.get('id')
+ # ending inside a child
+ before_start, after_start = s.text[:start_offset], s.text[start_offset:]
+ included, after = e.text[:end_offset], e.text[end_offset:]
+ e.child.text = included
+ c = self.clone_simple_element(e.child)  # the clone carries the part of the child's text that follows the sentence
+ c.text = after
+ e.child.tail = None
+ self.elem.text = after_start
+ w = self.wrap_contents(None, e.child)
+ self.elem.text = before_start
+ if after:
+ self.texts[self.pos] = Chunk(c, c.text, end)
+ else:
+ self.pos += 1
+ return w.get('id')
+ # starting in a child text or tail
+ if s.is_tail:
+ if e.is_tail:
+ if s is e: # end in tail of same element
+ before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
+ s.child.tail = before
+ w = self.make_wrapper(sentence)
+ w.tail = after
+ idx = self.elem.index(s.child)
+ self.elem.insert(idx + 1, w)
+ if after:
+ self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
+ else:
+ self.pos += 1
+ return w.get('id')
+ s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]  # start and end lie in the tails of two different chunks
+ e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
+ idx = self.elem.index(s.child)
+ w = self.wrap_contents(self.elem[idx+1], e.child)
+ w.text, w.tail = after_start, after_end
+ if after_end:
+ self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
+ else:
+ self.pos += 1
+ return w.get('id')
+ # end inside some subsequent simple element
+ s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
+ e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
+ c = self.clone_simple_element(e.child)
+ c.text = after_end
+ e.child.tail = None
+ w = self.wrap_contents(self.elem[self.elem.index(s.child) + 1], e.child)
+ w.text = after_start
+ if after_end:
+ self.texts[self.pos] = Chunk(c, after_end, end)
+ else:
+ self.pos += 1
+ return w.get('id')
+ # start is in the text of a simple child
+ if s.child is e.child:
+ if e.is_tail: # ending in tail of element we start in
+ before_start, after_start = s.text[:start_offset], s.text[start_offset:]
+ c = self.clone_simple_element(s.child)
+ s.child.text, s.child.tail = before_start, None
+ before_end, after_end = e.text[:end_offset], e.text[end_offset:]
+ c.text, c.tail = after_start, before_end
+ w = self.wrap_contents(c, c)
+ w.tail = after_end
+ if after_end:
+ self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
+ else:
+ self.pos += 1
+ return w.get('id')
+ # start and end in text of element
+ before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
+ c = self.clone_simple_element(s.child)
+ s.child.text, s.child.tail = before, None
+ c.text, c.tail = sentence, None
+ c2 = self.clone_simple_element(c)
+ c2.text = after
+ self.make_into_wrapper(c)  # no extra span here: the clone holding the sentence is itself given the generated id
+ if after:
+ self.texts[self.pos] = Chunk(c2, after, end)
+ else:
+ self.pos += 1
+ return c.get('id')
+ # end is in a subsequent simple child or tail of one
+ s.child.text, after_start = s.text[:start_offset], s.text[start_offset:]
+ c = self.clone_simple_element(s.child)
+ c.text, s.child.tail = after_start, None
+ if e.is_tail:
+ e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
+ w = self.wrap_contents(c, e.child)
+ w.tail = after_end
+ if after_end:
+ self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
+ else:
+ self.pos += 1
+ return w.get('id')
+ # end is in text of subsequent simple child
+ e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
+ c2 = self.clone_simple_element(e.child)
+ c2.text, e.child.tail = after_end, None
+ w = self.wrap_contents(c, e.child)
+ if after_end:
+ self.texts[self.pos] = Chunk(c2, after_end, end)
+ else:
+ self.pos += 1
+ return w.get('id')
+
+ stack_of_parents = [Parent(elem, 'body', root_lang, root_voice) for elem in root.iterchildren('*') if barename(elem.tag).lower() == 'body']  # start from every child of root whose tag name is 'body'
+ while stack_of_parents:
+ p = stack_of_parents.pop()
+ if len(p.elem) == 1 and not has_text(p.elem): # wrapper
+ c = p.elem[0]
+ if isinstance(c.tag, str):  # only a child whose tag is a str is descended into
+ stack_of_parents.append(Parent(c, barename(c.tag).lower(), p.lang, p.voice))
+ continue
+ for i in range(p.child_pos, len(p.children)):  # resume where an earlier visit of p stopped
+ child = p.children[i]
+ child_voice = child.get('data-calibre-tts', '')  # NOTE(review): raw attribute value ('' when absent) is compared below with p.voice, which may be inherited or decoded from JSON - confirm intended
+ child_lang = lang_for_elem(child, p.lang)
+ child_tag_name = barename(child.tag).lower() if isinstance(child.tag, str) else ''
+ if child_lang == p.lang and child_voice == p.voice and child_tag_name in continued_tag_names and len(child) == 0:
+ p.add_simple_child(child)
+ elif child_tag_name not in ignored_tag_names:  # queue the child as its own Parent, wrap what p has collected so far, and re-queue p to resume after this child
+ stack_of_parents.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
+ p.commit()
+ p.child_pos = i + 1
+ stack_of_parents.append(p)
+ continue  # NOTE(review): indentation is not visible in this view, so it is unclear which loop this continues - confirm against upstream
+ if text := child.tail:
+ p.add_tail(child, text)
+ p.commit()
+ for src_elem, clones in clones_map.items():  # remove originals and clones that ended up with no text, no tail, no id and no name
+ for clone in clones + [src_elem]:
+ if not clone.text and not clone.tail and not clone.get('id') and not clone.get('name'):
+ if (p := clone.getparent()) is not None:
+ p.remove(clone)
+ return ans
diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py
index 452417f278..cba9b3cf8c 100644
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -4,10 +4,7 @@
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal '
-from collections import defaultdict
-from contextlib import suppress
from threading import Lock
-from typing import NamedTuple
from calibre.utils.icu import _icu
from calibre.utils.localization import lang_as_iso639_1
@@ -128,331 +125,3 @@ def split_into_sentences_for_tts(
yield start, sentence
if pending_sentence:
yield pending_start, pending_sentence
-
-
-class Sentence(NamedTuple):
- elem_id: str
- text: str
- lang: str
- voice : str
-
-
-def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
- import json
-
- from lxml.etree import ElementBase as Element
- from lxml.etree import tostring as _tostring
-
- from calibre.ebooks.oeb.base import barename
- from calibre.utils.localization import canonicalize_lang, get_lang
- continued_tag_names = frozenset({
- 'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
- })
- ignored_tag_names = frozenset({
- 'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
- })
-
- def tostring(x) -> str:
- return _tostring(x, encoding='unicode')
-
- def lang_for_elem(elem, parent_lang):
- return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
-
- def has_text(elem):
- if elem.text and elem.text.strip():
- return True
- for child in elem:
- if child.tail and child.tail.strip():
- return True
- return False
-
- root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
- root_voice = voice
- seen_ids = set(root.xpath('//*/@id'))
- id_counter = 1
- ans = []
- clones_map = defaultdict(list)
-
- class Chunk(NamedTuple):
- child: Element | None
- text: str
- start_at: int
- is_tail: bool = False
-
-
- class Parent:
-
- def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):
- self.elem = elem
- self.tag_name = tag_name
- self.lang = child_lang or lang_for_elem(elem, parent_lang)
- q = elem.get('data-calibre-tts', '')
- self.voice = parent_voice
- if q.startswith('{'): # }
- with suppress(Exception):
- q = json.loads(q)
- self.voice = q.get('voice') or parent_voice
- else:
- self.voice = q or parent_voice
- self.pos = 0
- self.texts = []
- if elem.text and elem.text.strip():
- self.texts.append(Chunk(None, elem.text, self.pos))
- self.pos += len(elem.text)
- self.children = tuple(elem.iterchildren())
- self.child_pos = 0
-
- def add_simple_child(self, elem):
- if text := elem.text:
- self.texts.append(Chunk(elem, text, self.pos))
- self.pos += len(text)
-
- def add_tail(self, elem, text):
- self.texts.append(Chunk(elem, text, self.pos, is_tail=True))
- self.pos += len(text)
-
- def commit(self) -> None:
- if not self.texts:
- return
- text = ''.join(c.text for c in self.texts)
- self.pos = 0
- for start, length in sentence_positions(text, self.lang):
- elem_id = self.wrap_sentence(start, length)
- ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
- self.texts = []
- self.pos = 0
-
- def make_into_wrapper(self, elem: Element) -> str:
- nonlocal id_counter
- while True:
- q = f'cttsw-{id_counter}'
- if q not in seen_ids:
- elem.set('id', q)
- seen_ids.add(q)
- return q
- id_counter += 1
-
- def make_wrapper(self, text: str | None) -> Element:
- ns, sep, _ = self.elem.tag.partition('}')
- ans = self.elem.makeelement(ns + sep + 'span')
- ans.text = text
- self.make_into_wrapper(ans)
- return ans
-
- def replace_reference_to_child(self, elem: Element, replacement: Element) -> None:
- for i in range(self.pos + 1, len(self.texts)):
- if self.texts[i].child is elem:
- self.texts[i] = self.texts[i]._replace(child=replacement)
- else:
- break
-
- def wrap_contents(self, first_child: Element | None, last_child: Element) -> Element:
- w = self.make_wrapper(self.elem.text if first_child is None else None)
- in_range = False
- for c in self.elem.iterchildren('*'):
- if not in_range and (first_child is None or first_child is c):
- in_range = True
- pos = self.elem.index(c)
- self.elem.insert(pos, w)
- w.append(c)
- first_child = c
- if in_range:
- if last_child is not first_child:
- w.append(last_child)
- if c is last_child:
- break
- self.replace_reference_to_child(last_child, w)
- return w
-
- def clone_simple_element(self, elem: Element) -> Element:
- ans = elem.makeelement(elem.tag)
- ans.attrib.update(elem.attrib)
- ans.attrib.pop('id', None)
- ans.attrib.pop('name', None)
- ans.text, ans.tail = elem.text, elem.tail
- p = elem.getparent()
- idx = p.index(elem)
- p.insert(idx + 1, ans)
- self.replace_reference_to_child(elem, ans)
- clones_map[elem].append(ans)
- return ans
-
- def wrap_sentence(self, start: int, length: int) -> str:
- end = start + length
- start_chunk = end_chunk = -1
- start_offset = end_offset = 0
- for i in range(self.pos, len(self.texts)):
- c = self.texts[i]
- if c.start_at <= start:
- start_chunk = i
- start_offset = start - c.start_at
- if end <= c.start_at + len(c.text):
- end_chunk = i
- self.pos = i
- end_offset = end - c.start_at
- break
- else:
- self.pos = end_chunk = len(self.texts) - 1
- end_offset = len(self.texts[-1].text)
- assert start_chunk > -1
- s, e = self.texts[start_chunk], self.texts[end_chunk]
- if s.child is None: # start in leading text of parent element
- if e is s: # end also in leading text of parent element
- before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
- self.elem.text = before
- w = self.make_wrapper(sentence)
- self.elem.insert(0, w)
- w.tail = after
- if after:
- self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
- else:
- self.pos += 1
- return w.get('id')
- if e.is_tail: # ending in the tail of a child
- before_start, after_start = s.text[:start_offset], s.text[start_offset:]
- included, after = e.text[:end_offset], e.text[end_offset:]
- e.child.tail = included
- self.elem.text = after_start
- w = self.wrap_contents(None, e.child)
- w.tail = after
- self.elem.text = before_start
- if after:
- self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
- else:
- self.pos += 1
- return w.get('id')
- # ending inside a child
- before_start, after_start = s.text[:start_offset], s.text[start_offset:]
- included, after = e.text[:end_offset], e.text[end_offset:]
- e.child.text = included
- c = self.clone_simple_element(e.child)
- c.text = after
- e.child.tail = None
- self.elem.text = after_start
- w = self.wrap_contents(None, e.child)
- self.elem.text = before_start
- if after:
- self.texts[self.pos] = Chunk(c, c.text, end)
- else:
- self.pos += 1
- return w.get('id')
- # starting in a child text or tail
- if s.is_tail:
- if e.is_tail:
- if s is e: # end in tail of same element
- before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
- s.child.tail = before
- w = self.make_wrapper(sentence)
- w.tail = after
- idx = self.elem.index(s.child)
- self.elem.insert(idx + 1, w)
- if after:
- self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
- else:
- self.pos += 1
- return w.get('id')
- s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
- e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
- idx = self.elem.index(s.child)
- w = self.wrap_contents(self.elem[idx+1], e.child)
- w.text, w.tail = after_start, after_end
- if after_end:
- self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
- else:
- self.pos += 1
- return w.get('id')
- # end inside some subsequent simple element
- s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
- e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
- c = self.clone_simple_element(e.child)
- c.text = after_end
- e.child.tail = None
- w = self.wrap_contents(self.elem[self.elem.index(s.child) + 1], e.child)
- w.text = after_start
- if after_end:
- self.texts[self.pos] = Chunk(c, after_end, end)
- else:
- self.pos += 1
- return w.get('id')
- # start is in the text of a simple child
- if s.child is e.child:
- if e.is_tail: # ending in tail of element we start in
- before_start, after_start = s.text[:start_offset], s.text[start_offset:]
- c = self.clone_simple_element(s.child)
- s.child.text, s.child.tail = before_start, None
- before_end, after_end = e.text[:end_offset], e.text[end_offset:]
- c.text, c.tail = after_start, before_end
- w = self.wrap_contents(c, c)
- w.tail = after_end
- if after_end:
- self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
- else:
- self.pos += 1
- return w.get('id')
- # start and end in text of element
- before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
- c = self.clone_simple_element(s.child)
- s.child.text, s.child.tail = before, None
- c.text, c.tail = sentence, None
- c2 = self.clone_simple_element(c)
- c2.text = after
- self.make_into_wrapper(c)
- if after:
- self.texts[self.pos] = Chunk(c2, after, end)
- else:
- self.pos += 1
- return c.get('id')
- # end is in a subsequent simple child or tail of one
- s.child.text, after_start = s.text[:start_offset], s.text[start_offset:]
- c = self.clone_simple_element(s.child)
- c.text, s.child.tail = after_start, None
- if e.is_tail:
- e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
- w = self.wrap_contents(c, e.child)
- w.tail = after_end
- if after_end:
- self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
- else:
- self.pos += 1
- return w.get('id')
- # end is in text of subsequent simple child
- e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
- c2 = self.clone_simple_element(e.child)
- c2.text, e.child.tail = after_end, None
- w = self.wrap_contents(c, e.child)
- if after_end:
- self.texts[self.pos] = Chunk(c2, after_end, end)
- else:
- self.pos += 1
- return w.get('id')
-
- stack_of_parents = [Parent(elem, 'body', root_lang, root_voice) for elem in root.iterchildren('*') if barename(elem.tag).lower() == 'body']
- while stack_of_parents:
- p = stack_of_parents.pop()
- if len(p.elem) == 1 and not has_text(p.elem): # wrapper
- c = p.elem[0]
- if isinstance(c.tag, str):
- stack_of_parents.append(Parent(c, barename(c.tag).lower(), p.lang, p.voice))
- continue
- for i in range(p.child_pos, len(p.children)):
- child = p.children[i]
- child_voice = child.get('data-calibre-tts', '')
- child_lang = lang_for_elem(child, p.lang)
- child_tag_name = barename(child.tag).lower() if isinstance(child.tag, str) else ''
- if child_lang == p.lang and child_voice == p.voice and child_tag_name in continued_tag_names and len(child) == 0:
- p.add_simple_child(child)
- elif child_tag_name not in ignored_tag_names:
- stack_of_parents.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
- p.commit()
- p.child_pos = i + 1
- stack_of_parents.append(p)
- continue
- if text := child.tail:
- p.add_tail(child, text)
- p.commit()
- for src_elem, clones in clones_map.items():
- for clone in clones + [src_elem]:
- if not clone.text and not clone.tail and not clone.get('id') and not clone.get('name'):
- if (p := clone.getparent()) is not None:
- p.remove(clone)
- return ans
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index 066f146574..e482dd4ee0 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -267,57 +267,6 @@ class TestICU(unittest.TestCase):
}.items():
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
- def test_mark_sentences(self):
- from html5_parser import parse
- from lxml import html
-
- from calibre.spell.break_iterator import mark_sentences_in_html
- for text, expected in reversed({
- 'hello cruel world': '
hello cruel world
',
-
- 'hello cruel world': '
hello cruel world
',
-
- 'Yes, please. Hello cruel world.':
- '
Yes, please. Hello cruel world.
',
-
- 'Hello cruel world. ':
- '
Hello cruel world.
',
-
- 'Yes, please. Well done! Bravissima! ':
- '
Yes, please. Well done! Bravissima!
',
-
- 'Yes, please. Well done! Bravissima! ':
- '
Yes, please. Well done! Bravissima!
',
-
- 'Hello, world! Good day to you':
- '
Hello, world! Good day to you
',
-
- 'Hello, world! Good day to you':
- '
Hello, world! Good day to you
',
-
- 'Hello, world!Good day to you':
- '
Hello, world!Good day to you
',
-
- 'Hello, world! Good day to you':
- '
Hello, world! Good day to you
',
-
- 'Hello, world!':
- 'Hello, world!
',
-
- 'Hello, world!':
- 'Hello, world!
',
- }.items()):
- root = parse(text, namespace_elements=True)
- mark_sentences_in_html(root)
- actual = html.tostring(root, encoding='unicode')
- actual = actual[actual.find('')]
- actual = actual.replace('cttsw-', '')
- self.ae(expected, actual)
- sentences = mark_sentences_in_html(parse('Hello, world!'))
- self.ae(tuple(s.lang for s in sentences), ('eng', 'fra'))
-
-
def find_tests():
return unittest.defaultTestLoader.loadTestsFromTestCase(TestICU)