diff --git a/src/calibre/ebooks/oeb/polish/tests/structure.py b/src/calibre/ebooks/oeb/polish/tests/structure.py index 4c9e0c7a69..833a530b01 100644 --- a/src/calibre/ebooks/oeb/polish/tests/structure.py +++ b/src/calibre/ebooks/oeb/polish/tests/structure.py @@ -251,6 +251,9 @@ class Structure(BaseTest): '
Here is some bold, italic, underline, text.': '
Here is some bold, italic, underline, text.
', + + 'A sentence wrapped\nonto multiple lines.': + '
A sentence wrapped\nonto multiple lines.
', }.items()): root = parse(text, namespace_elements=True) orig = normalize_markup(root) diff --git a/src/calibre/ebooks/oeb/polish/tts.py b/src/calibre/ebooks/oeb/polish/tts.py index 6e122ea7f5..3e3c96913d 100644 --- a/src/calibre/ebooks/oeb/polish/tts.py +++ b/src/calibre/ebooks/oeb/polish/tts.py @@ -18,7 +18,7 @@ from calibre.ebooks.oeb.base import EPUB, EPUB_NS, SMIL_NS, barename from calibre.ebooks.oeb.polish.container import OEB_DOCS, seconds_to_timestamp from calibre.ebooks.oeb.polish.errors import UnsupportedContainerType from calibre.ebooks.oeb.polish.upgrade import upgrade_book -from calibre.spell.break_iterator import sentence_positions +from calibre.spell.break_iterator import split_into_sentences_for_tts_embed from calibre.utils.localization import canonicalize_lang, get_lang @@ -115,14 +115,14 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten if self.texts: text = ''.join(c.text for c in self.texts) self.pos = 0 - for start, length in sentence_positions(text, self.lang): + for start, length in split_into_sentences_for_tts_embed(text, self.lang): elem_id = self.wrap_sentence(start, length) ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice)) if self.has_tail: p = self.elem.getparent() spans = [] before = after = None - for start, length in sentence_positions(self.elem.tail, self.parent_lang): + for start, length in split_into_sentences_for_tts_embed(self.elem.tail, self.parent_lang): end = start + length text = self.elem.tail[start:end] if before is None: diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py index cba9b3cf8c..506c4d83d3 100644 --- a/src/calibre/spell/break_iterator.py +++ b/src/calibre/spell/break_iterator.py @@ -96,6 +96,16 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in PARAGRAPH_SEPARATOR = '\u2029'
+def split_into_sentences_for_tts_embed(  # Yield whatever sentence_positions() yields for a whitespace-normalised copy of text; callers unpack each item as (start, length)
+    text: str, lang: str = 'en',  # text: string to split; lang: handed unchanged to sentence_positions()
+):
+    import re
+    def sub(m):  # replacement for a run of 2+ newlines: one PARAGRAPH_SEPARATOR padded with spaces to the run's original length
+        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)
+    text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')  # lone newlines become spaces; every replacement is same-length, so offsets still index the caller's string. NOTE(review): '\r' is blanked before the newline-run match, so '\r\n\r\n' is not seen as a run -- confirm intended
+    yield from sentence_positions(text, lang)
+
+
 def split_into_sentences_for_tts( text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR): import re