When embedding TTS narration, don't break sentences on single newlines

2025-07-08 10:44:09 -04:00 · 2024-10-25 14:47:21 +05:30 · 2024-10-25 14:47:21 +05:30 · 5d7c6937de
commit 5d7c6937de
parent 5f9284cbe1
3 changed files with 16 additions and 3 deletions
--- a/src/calibre/ebooks/oeb/polish/tests/structure.py
+++ b/src/calibre/ebooks/oeb/polish/tests/structure.py
@@ -251,6 +251,9 @@ class Structure(BaseTest):

            '<p>Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.':
            '<body><p><span id="1">Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.</span></p>',
+
+            '<p>A sentence wrapped\nonto multiple lines.':
+            '<body><p><span id="1">A sentence wrapped\nonto multiple lines.</span></p>',
        }.items()):
            root = parse(text, namespace_elements=True)
            orig = normalize_markup(root)
--- a/src/calibre/ebooks/oeb/polish/tts.py
+++ b/src/calibre/ebooks/oeb/polish/tts.py
@@ -18,7 +18,7 @@ from calibre.ebooks.oeb.base import EPUB, EPUB_NS, SMIL_NS, barename
 from calibre.ebooks.oeb.polish.container import OEB_DOCS, seconds_to_timestamp
 from calibre.ebooks.oeb.polish.errors import UnsupportedContainerType
 from calibre.ebooks.oeb.polish.upgrade import upgrade_book
-from calibre.spell.break_iterator import sentence_positions
+from calibre.spell.break_iterator import split_into_sentences_for_tts_embed
 from calibre.utils.localization import canonicalize_lang, get_lang


@@ -115,14 +115,14 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
            if self.texts:
                text = ''.join(c.text for c in self.texts)
                self.pos = 0
-                for start, length in sentence_positions(text, self.lang):
+                for start, length in split_into_sentences_for_tts_embed(text, self.lang):
                    elem_id = self.wrap_sentence(start, length)
                    ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
            if self.has_tail:
                p = self.elem.getparent()
                spans = []
                before = after = None
-                for start, length in sentence_positions(self.elem.tail, self.parent_lang):
+                for start, length in split_into_sentences_for_tts_embed(self.elem.tail, self.parent_lang):
                    end = start + length
                    text = self.elem.tail[start:end]
                    if before is None:
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -96,6 +96,16 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
 PARAGRAPH_SEPARATOR = '\u2029'


+def split_into_sentences_for_tts_embed(
+    text: str, lang: str = 'en',
+):
+    import re
+    def sub(m):
+        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)
+    text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
+    yield from sentence_positions(text, lang)
+
+
 def split_into_sentences_for_tts(
    text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
    import re