When embedding TTS narration don't break sentences on single newlines

This commit is contained in:
Kovid Goyal 2024-10-25 14:47:21 +05:30
parent 5f9284cbe1
commit 5d7c6937de
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 16 additions and 3 deletions

View File

@ -251,6 +251,9 @@ class Structure(BaseTest):
'<p>Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.':
'<body><p><span id="1">Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.</span></p>',
'<p>A sentence wrapped\nonto multiple lines.':
'<body><p><span id="1">A sentence wrapped\nonto multiple lines.</span></p>',
}.items()):
root = parse(text, namespace_elements=True)
orig = normalize_markup(root)

View File

@ -18,7 +18,7 @@ from calibre.ebooks.oeb.base import EPUB, EPUB_NS, SMIL_NS, barename
from calibre.ebooks.oeb.polish.container import OEB_DOCS, seconds_to_timestamp
from calibre.ebooks.oeb.polish.errors import UnsupportedContainerType
from calibre.ebooks.oeb.polish.upgrade import upgrade_book
from calibre.spell.break_iterator import sentence_positions
from calibre.spell.break_iterator import split_into_sentences_for_tts_embed
from calibre.utils.localization import canonicalize_lang, get_lang
@ -115,14 +115,14 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
if self.texts:
text = ''.join(c.text for c in self.texts)
self.pos = 0
for start, length in sentence_positions(text, self.lang):
for start, length in split_into_sentences_for_tts_embed(text, self.lang):
elem_id = self.wrap_sentence(start, length)
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
if self.has_tail:
p = self.elem.getparent()
spans = []
before = after = None
for start, length in sentence_positions(self.elem.tail, self.parent_lang):
for start, length in split_into_sentences_for_tts_embed(self.elem.tail, self.parent_lang):
end = start + length
text = self.elem.tail[start:end]
if before is None:

View File

@ -96,6 +96,16 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
# Unicode PARAGRAPH SEPARATOR (U+2029). Substituted for runs of two or more
# newlines before text is handed to the sentence splitter.
PARAGRAPH_SEPARATOR = '\u2029'
def split_into_sentences_for_tts_embed(
    text: str, lang: str = 'en',
):
    '''
    Yield the items produced by sentence_positions() for a copy of ``text``
    in which line breaks have been neutralised.

    Every carriage return and every isolated newline becomes a space, while a
    run of two or more newlines becomes PARAGRAPH_SEPARATOR padded with
    spaces. Each substitution keeps the character count unchanged, so offsets
    computed on the rewritten string also index the original ``text``.

    :param text: The text to split.
    :param lang: Language code forwarded unchanged to sentence_positions().
    '''
    import re

    def paragraph_break(match):
        # Pad with spaces so the replacement is exactly as long as the
        # newline run it stands in for.
        run_length = len(match.group())
        return PARAGRAPH_SEPARATOR + ' ' * (run_length - 1)

    # Carriage returns are blanked first, so only bare newline runs are
    # considered when looking for paragraph breaks.
    without_cr = text.replace('\r', ' ')
    with_paragraphs = re.sub(r'\n{2,}', paragraph_break, without_cr)
    # Any newline still present is a single one: treat it as plain whitespace.
    flattened = with_paragraphs.replace('\n', ' ')
    yield from sentence_positions(flattened, lang)
def split_into_sentences_for_tts(
text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
import re