mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
When embedding TTS narration, don't break sentences on single newlines
This commit is contained in:
parent
5f9284cbe1
commit
5d7c6937de
@ -251,6 +251,9 @@ class Structure(BaseTest):
|
|||||||
|
|
||||||
'<p>Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.':
|
'<p>Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.':
|
||||||
'<body><p><span id="1">Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.</span></p>',
|
'<body><p><span id="1">Here is some <b>bold, </b><i>italic, </i><u>underline, </u> text.</span></p>',
|
||||||
|
|
||||||
|
'<p>A sentence wrapped\nonto multiple lines.':
|
||||||
|
'<body><p><span id="1">A sentence wrapped\nonto multiple lines.</span></p>',
|
||||||
}.items()):
|
}.items()):
|
||||||
root = parse(text, namespace_elements=True)
|
root = parse(text, namespace_elements=True)
|
||||||
orig = normalize_markup(root)
|
orig = normalize_markup(root)
|
||||||
|
@ -18,7 +18,7 @@ from calibre.ebooks.oeb.base import EPUB, EPUB_NS, SMIL_NS, barename
|
|||||||
from calibre.ebooks.oeb.polish.container import OEB_DOCS, seconds_to_timestamp
|
from calibre.ebooks.oeb.polish.container import OEB_DOCS, seconds_to_timestamp
|
||||||
from calibre.ebooks.oeb.polish.errors import UnsupportedContainerType
|
from calibre.ebooks.oeb.polish.errors import UnsupportedContainerType
|
||||||
from calibre.ebooks.oeb.polish.upgrade import upgrade_book
|
from calibre.ebooks.oeb.polish.upgrade import upgrade_book
|
||||||
from calibre.spell.break_iterator import sentence_positions
|
from calibre.spell.break_iterator import split_into_sentences_for_tts_embed
|
||||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||||
|
|
||||||
|
|
||||||
@ -115,14 +115,14 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
if self.texts:
|
if self.texts:
|
||||||
text = ''.join(c.text for c in self.texts)
|
text = ''.join(c.text for c in self.texts)
|
||||||
self.pos = 0
|
self.pos = 0
|
||||||
for start, length in sentence_positions(text, self.lang):
|
for start, length in split_into_sentences_for_tts_embed(text, self.lang):
|
||||||
elem_id = self.wrap_sentence(start, length)
|
elem_id = self.wrap_sentence(start, length)
|
||||||
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
|
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
|
||||||
if self.has_tail:
|
if self.has_tail:
|
||||||
p = self.elem.getparent()
|
p = self.elem.getparent()
|
||||||
spans = []
|
spans = []
|
||||||
before = after = None
|
before = after = None
|
||||||
for start, length in sentence_positions(self.elem.tail, self.parent_lang):
|
for start, length in split_into_sentences_for_tts_embed(self.elem.tail, self.parent_lang):
|
||||||
end = start + length
|
end = start + length
|
||||||
text = self.elem.tail[start:end]
|
text = self.elem.tail[start:end]
|
||||||
if before is None:
|
if before is None:
|
||||||
|
@ -96,6 +96,16 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
|
|||||||
PARAGRAPH_SEPARATOR = '\u2029'
|
PARAGRAPH_SEPARATOR = '\u2029'
|
||||||
|
|
||||||
|
|
||||||
|
def split_into_sentences_for_tts_embed(
    text: str, lang: str = 'en',
):
    '''
    Yield the items produced by sentence_positions() for ``text`` after
    neutralising line breaks, so that a single newline (a soft wrap) does not
    end a sentence while a blank line still does.

    Every substitution keeps the string length unchanged: callers slice the
    *original* text with the yielded positions, so offsets must not shift.

    :param text: The text to split. Runs of two or more line breaks (LF or
        CRLF) become PARAGRAPH_SEPARATOR padded with spaces; any remaining
        carriage return or newline becomes a single space.
    :param lang: Language code passed through unchanged to sentence_positions().
    '''
    import re

    def sub(m):
        # Pad with spaces to the length of the matched run to preserve offsets.
        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)

    # Match blank lines *before* blanking out carriage returns. Otherwise
    # '\r\n\r\n' turns into ' \n \n', which contains no run of consecutive
    # newlines, and the paragraph break is silently lost.
    text = re.sub(r'(?:\r?\n){2,}', sub, text)
    # Whatever line-break characters remain are soft wraps or stray CRs.
    text = text.replace('\r', ' ').replace('\n', ' ')
    yield from sentence_positions(text, lang)
|
||||||
|
|
||||||
|
|
||||||
def split_into_sentences_for_tts(
|
def split_into_sentences_for_tts(
|
||||||
text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
|
text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
|
||||||
import re
|
import re
|
||||||
|
Loading…
x
Reference in New Issue
Block a user