From 36d9b620c655a161505e87aac66dda1d3cf9f576 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 3 Sep 2024 13:22:24 +0530
Subject: [PATCH] Make sentence splitting code testable

---
 src/calibre/gui2/tts2/piper.py      | 50 +++++++----------------------
 src/calibre/spell/break_iterator.py | 35 ++++++++++++++++++++
 2 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/src/calibre/gui2/tts2/piper.py b/src/calibre/gui2/tts2/piper.py
index 7a192633f4..58b67f922c 100644
--- a/src/calibre/gui2/tts2/piper.py
+++ b/src/calibre/gui2/tts2/piper.py
@@ -32,7 +32,7 @@ from qt.core import (
 from calibre.constants import cache_dir, is_debugging
 from calibre.gui2 import error_dialog
 from calibre.gui2.tts2.types import EngineSpecificSettings, Quality, TTSBackend, Voice, piper_cmdline, widget_parent
-from calibre.spell.break_iterator import sentence_positions, split_into_words_and_positions
+from calibre.spell.break_iterator import split_into_sentences_for_tts
 from calibre.utils.localization import canonicalize_lang, get_lang
 from calibre.utils.resources import get_path as P
 
@@ -58,7 +58,6 @@ class Utterance:
     synthesized: bool = False
 
 
-PARAGRAPH_SEPARATOR = '\u2029'
 UTTERANCE_SEPARATOR = b'\n'
 
 
@@ -141,44 +140,17 @@ class UtteranceAudioQueue(QIODevice):
         return ans
 
 
-def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: int = 2048):
-    if len(sentence) <= limit:
-        yield offset, sentence
-        return
-    buf, total, start_at = [], 0, 0
-
-    def a(s, e):
-        nonlocal total, start_at
-        t = sentence[s:e]
-        if not buf:
-            start_at = s
-        buf.append(t)
-        total += len(t)
-
-    for start, length in split_into_words_and_positions(sentence, lang):
-        a(start, start + length)
-        if total >= limit:
-            yield offset + start_at, ' '.join(buf)
-            buf, total = [], 0
-    if buf:
-        yield offset + start_at, ' '.join(buf)
-
-
 def split_into_utterances(text: str, counter: count, lang: str = 'en'):
-    text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
-    for start, length in sentence_positions(text, lang):
-        sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
-        if sentence:
-            for start, sentence in split_long_sentences(sentence, start, lang):
-                payload = json.dumps({'text': sentence}).encode('utf-8')
-                ba = QByteArray()
-                ba.reserve(len(payload) + 1)
-                ba.append(payload)
-                ba.append(UTTERANCE_SEPARATOR)
-                u = Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(),
-                              left_to_write=ba, start=start, length=len(sentence))
-                debug(f'Utterance created {u.id}: {sentence}')
-                yield u
+    for start, sentence in split_into_sentences_for_tts(text, lang):
+        payload = json.dumps({'text': sentence}).encode('utf-8')
+        ba = QByteArray()
+        ba.reserve(len(payload) + 1)
+        ba.append(payload)
+        ba.append(UTTERANCE_SEPARATOR)
+        u = Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(),
+                      left_to_write=ba, start=start, length=len(sentence))
+        debug(f'Utterance created {u.id} {start=}: {sentence!r}')
+        yield u
 
 
 class Piper(TTSBackend):
diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py
index 2619ad4dc8..503ae8ea3d 100644
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -68,3 +68,38 @@ def count_words(text, lang='en'):
     it = get_iterator(lang)
     it.set_text(text)
     return it.count_words()
+
+
+def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: int = 2048):
+    if len(sentence) <= limit:
+        yield offset, sentence
+        return
+    buf, total, start_at = [], 0, 0
+
+    def a(s, e):
+        nonlocal total, start_at
+        t = sentence[s:e]
+        if not buf:
+            start_at = s
+        buf.append(t)
+        total += len(t)
+
+    for start, length in split_into_words_and_positions(sentence, lang):
+        a(start, start + length)
+        if total >= limit:
+            yield offset + start_at, ' '.join(buf)
+            buf, total = [], 0
+    if buf:
+        yield offset + start_at, ' '.join(buf)
+
+
+def split_into_sentences_for_tts(text: str, lang: str = 'en', PARAGRAPH_SEPARATOR: str = '\u2029'):
+    import re
+    def sub(m):
+        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)
+    text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
+    for start, length in sentence_positions(text, lang):
+        sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
+        if sentence:
+            for start, sentence in split_long_sentences(sentence, start, lang):
+                yield start, sentence
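
With the splitting logic now living in calibre.spell.break_iterator, it can be exercised without Qt or a running Piper process. The sketch below is a hypothetical smoke test, not part of this patch: the test class name and sample text are made up for illustration, and it assumes a working calibre development environment, since the break iterator relies on calibre's ICU bindings. It only checks broad invariants rather than exact boundaries, which depend on ICU.

import unittest

from calibre.spell.break_iterator import split_into_sentences_for_tts


class SplitForTTSTest(unittest.TestCase):  # hypothetical test, not part of the patch

    def test_basic_splitting(self):
        text = 'Hello there. A second sentence.\n\nA new paragraph, split\nacross lines.'
        chunks = list(split_into_sentences_for_tts(text, 'en'))
        self.assertTrue(chunks, 'expected at least one sentence')
        for start, sentence in chunks:
            # The function replaces newlines rather than removing them, so the
            # text it indexes has the same length as the input and every start
            # offset must fall inside it.
            self.assertTrue(0 <= start < len(text))
            # Empty or whitespace-only sentences are filtered out before yielding.
            self.assertTrue(sentence.strip())


if __name__ == '__main__':
    unittest.main()

In a source checkout such a script would need to run inside calibre's environment, e.g. via calibre-debug, rather than a plain Python interpreter.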