Don't allow sentences that are too short in the piper backend

The neural net doesn't synthesize very short sentences well
This commit is contained in:
Kovid Goyal 2024-09-03 13:44:41 +05:30
parent 1175711d54
commit 54374e9479
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 29 additions and 2 deletions

View File

@ -96,13 +96,30 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
PARAGRAPH_SEPARATOR = '\u2029'
def split_into_sentences_for_tts(
    text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
    '''
    Yield ``(start, sentence)`` pairs covering ``text``, sized for speech synthesis.

    Carriage returns and single newlines become spaces; every run of two or
    more newlines becomes ``PARAGRAPH_SEPARATOR`` padded with spaces to the
    length of the run, so the normalized text has the same length as the input.

    Each sentence reported by ``sentence_positions()`` is stripped and passed
    through ``split_long_sentences()`` with ``limit=max_sentence_length``.
    Pieces shorter than ``min_sentence_length`` are held back and joined, with
    a single space, to the pieces that follow until the joined text reaches
    the minimum. Consequently:

    * a joined sentence is reported at the start offset of its first piece,
    * a joined sentence may be longer than ``max_sentence_length``,
    * the final yielded sentence may still be shorter than
      ``min_sentence_length`` if the text ends while pieces are held back.
    '''
    import re

    def paragraph_break(match):
        # Same width as the newline run it replaces, keeping later offsets stable.
        return PARAGRAPH_SEPARATOR + ' ' * (len(match.group()) - 1)

    text = text.replace('\r', ' ')
    text = re.sub(r'\n{2,}', paragraph_break, text)
    text = text.replace('\n', ' ')

    held_start, held_text = 0, ''
    for position, size in sentence_positions(text, lang):
        # No newlines survive the normalization above, so a plain strip suffices.
        whole = text[position:position + size].strip()
        if not whole:
            continue
        for piece_start, piece in split_long_sentences(whole, position, lang, limit=max_sentence_length):
            if held_text:
                # Continue the run of short pieces, anchored at its first piece.
                piece_start, piece = held_start, held_text + ' ' + piece
            if len(piece) >= min_sentence_length:
                yield piece_start, piece
                held_start, held_text = 0, ''
            else:
                held_start, held_text = piece_start, piece

    # Flush whatever is still held back, even if it is below the minimum length.
    if held_text:
        yield held_start, held_text

View File

@ -256,6 +256,16 @@ class TestICU(unittest.TestCase):
}.items():
self.ae(expected, func(q))
def test_split_into_sentences(self):
    # Each case pairs an input text with the (offset, sentence) tuples that
    # split_into_sentences_for_tts() should produce when max_sentence_length
    # is 40 and the minimum sentence length is left at its default.
    from calibre.spell.break_iterator import split_into_sentences_for_tts
    cases = (
        ('hello.', [(0, 'hello.')]),
        (
            'hello. I love you. Another small sentence. Fini.',
            [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
        ),
        (
            'a very long sentence to be split into at least two smaller sentences',
            [(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
        ),
    )
    for text, expected in cases:
        actual = list(split_into_sentences_for_tts(text, max_sentence_length=40))
        self.ae(expected, actual)
def find_tests():
    '''Load the tests defined on TestICU using unittest's default loader.'''
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(TestICU)