Split long sentences before feeding into piper

This commit is contained in:
Kovid Goyal 2024-09-01 21:01:29 +05:30
parent 67498869f1
commit c287218d08
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -17,7 +17,7 @@ from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray,
from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
from calibre.gui2.tts2.types import TTSBackend from calibre.gui2.tts2.types import TTSBackend
from calibre.ptempfile import base_dir from calibre.ptempfile import base_dir
from calibre.spell.break_iterator import sentence_positions from calibre.spell.break_iterator import sentence_positions, split_into_words_and_positions
@lru_cache(2) @lru_cache(2)
@ -145,17 +145,42 @@ class UtteranceAudioQueue(QIODevice):
return ans return ans
def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: int = 2048):
if len(sentence) <= limit:
yield offset, sentence
return
buf, total, start_at = [], 0, 0
def a(s, e):
nonlocal total, start_at
t = sentence[s:e]
if not buf:
start_at = s
buf.append(t)
total += len(t)
for start, length in split_into_words_and_positions(sentence, lang):
a(start, start + length)
if total >= limit:
yield offset + start_at, ' '.join(buf)
buf, total = [], 0
if buf:
yield offset + start_at, ' '.join(buf)
def split_into_utterances(text: str, counter: count, lang: str = 'en'): def split_into_utterances(text: str, counter: count, lang: str = 'en'):
text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ') text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
for start, length in sentence_positions(text, lang): for start, length in sentence_positions(text, lang):
sentence = text[start:start+length].rstrip().replace('\n', ' ') sentence = text[start:start+length].rstrip().replace('\n', ' ')
length = len(sentence) for start, sentence in split_long_sentences(sentence, start, lang):
payload = json.dumps({'text': sentence}).encode('utf-8') payload = json.dumps({'text': sentence}).encode('utf-8')
ba = QByteArray() ba = QByteArray()
ba.reserve(len(payload) + 1) ba.reserve(len(payload) + 1)
ba.append(payload) ba.append(payload)
ba.append(UTTERANCE_SEPARATOR) ba.append(UTTERANCE_SEPARATOR)
yield Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba, start=start, length=length) u = Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba, start=start, length=len(sentence))
debug(f'Utterance created {u.id}: {sentence}')
yield u
class Piper(TTSBackend): class Piper(TTSBackend):
@ -391,6 +416,7 @@ def develop(): # {{{
'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. ' 'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
'Third, and final short sentence.' 'Third, and final short sentence.'
) )
# text = f'Hello world{PARAGRAPH_SEPARATOR}.{PARAGRAPH_SEPARATOR}Bye world'
def saying(offset, length): def saying(offset, length):
debug('Saying:', repr(text[offset:offset+length])) debug('Saying:', repr(text[offset:offset+length]))