mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
When embedding TTS narration, don't break sentences on single newlines
This commit is contained in:
parent
5f9284cbe1
commit
5d7c6937de
@ -251,6 +251,9 @@ class Structure(BaseTest):
|
||||
|
||||
'<p>Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.':
|
||||
'<body><p><span id="1">Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.</span></p>',
|
||||
|
||||
'<p>A sentence wrapped\nonto multiple lines.':
|
||||
'<body><p><span id="1">A sentence wrapped\nonto multiple lines.</span></p>',
|
||||
}.items()):
|
||||
root = parse(text, namespace_elements=True)
|
||||
orig = normalize_markup(root)
|
||||
|
@ -18,7 +18,7 @@ from calibre.ebooks.oeb.base import EPUB, EPUB_NS, SMIL_NS, barename
|
||||
from calibre.ebooks.oeb.polish.container import OEB_DOCS, seconds_to_timestamp
|
||||
from calibre.ebooks.oeb.polish.errors import UnsupportedContainerType
|
||||
from calibre.ebooks.oeb.polish.upgrade import upgrade_book
|
||||
from calibre.spell.break_iterator import sentence_positions
|
||||
from calibre.spell.break_iterator import split_into_sentences_for_tts_embed
|
||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||
|
||||
|
||||
@ -115,14 +115,14 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
||||
if self.texts:
|
||||
text = ''.join(c.text for c in self.texts)
|
||||
self.pos = 0
|
||||
for start, length in sentence_positions(text, self.lang):
|
||||
for start, length in split_into_sentences_for_tts_embed(text, self.lang):
|
||||
elem_id = self.wrap_sentence(start, length)
|
||||
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
|
||||
if self.has_tail:
|
||||
p = self.elem.getparent()
|
||||
spans = []
|
||||
before = after = None
|
||||
for start, length in sentence_positions(self.elem.tail, self.parent_lang):
|
||||
for start, length in split_into_sentences_for_tts_embed(self.elem.tail, self.parent_lang):
|
||||
end = start + length
|
||||
text = self.elem.tail[start:end]
|
||||
if before is None:
|
||||
|
@ -96,6 +96,16 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
|
||||
PARAGRAPH_SEPARATOR = '\u2029'
|
||||
|
||||
|
||||
def split_into_sentences_for_tts_embed(
    text: str, lang: str = 'en',
):
    '''
    Yield whatever sentence_positions() produces for a whitespace-normalized
    copy of text (callers in this changeset unpack each item as a
    (start, length) pair).

    A single newline does not end a sentence: it is turned into a space
    before the text is split. A run of two or more newlines is treated as a
    paragraph break instead. Carriage returns are turned into spaces.

    Every substitution is length-preserving, so the normalized copy has the
    same number of characters as text and offsets computed against it can be
    used to slice the original text.
    '''
    import re

    def paragraph_break(match):
        # Swap the whole run of newlines for one paragraph separator padded
        # with spaces, so the replacement is exactly as long as the run.
        run_length = len(match.group())
        return PARAGRAPH_SEPARATOR + ' ' * (run_length - 1)

    # Carriage returns are blanked first; note that this happens before the
    # newline runs are matched.
    without_carriage_returns = text.replace('\r', ' ')
    with_paragraph_marks = re.sub(r'\n{2,}', paragraph_break, without_carriage_returns)
    # Whatever newlines are left are lone ones, i.e. plain line wrapping.
    normalized = with_paragraph_marks.replace('\n', ' ')
    yield from sentence_positions(normalized, lang)
|
||||
|
||||
|
||||
def split_into_sentences_for_tts(
|
||||
text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
|
||||
import re
|
||||
|
Loading…
x
Reference in New Issue
Block a user