Don't allow sentences that are too short in the piper backend

The neural net doesn't synthesize very short sentences well
2025-08-11 09:13:57 -04:00 · 2024-09-03 13:44:41 +05:30 · 2024-09-03 13:44:41 +05:30 · 54374e9479
commit 54374e9479
parent 1175711d54
2 changed files with 29 additions and 2 deletions
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@ -96,13 +96,30 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
 # U+2029 (Unicode PARAGRAPH SEPARATOR). Used as the default marker that
 # split_into_sentences_for_tts() substitutes for runs of two or more newlines.
 PARAGRAPH_SEPARATOR = '\u2029'
def split_into_sentences_for_tts(
    text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
    '''
    Split text into chunks suitable for speech synthesis.

    This is a generator yielding (offset, sentence) tuples, where offset is
    the position in text at which the sentence starts.

    Before splitting, carriage returns and single newlines are replaced by
    spaces and every run of two or more newlines is replaced by
    PARAGRAPH_SEPARATOR padded with spaces to the length of the run, so the
    normalized text has the same length as the input and offsets stay valid.

    Sentences are located with sentence_positions() and each one is passed to
    split_long_sentences() with limit=max_sentence_length. Pieces shorter than
    min_sentence_length are not yielded on their own: they are accumulated,
    joined by single spaces, until the accumulated text reaches
    min_sentence_length or a long enough piece follows, in which case they are
    prepended to that piece. A merged chunk is reported at the offset of its
    first piece. Whatever is still pending when the input ends is yielded
    as is, even if it is shorter than min_sentence_length.
    '''
    import re

    def sub(m):
        # Pad to the length of the match so that offsets are preserved
        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)

    text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
    pending_start, pending_sentence = 0, ''
    for start, length in sentence_positions(text, lang):
        raw = text[start:start+length]
        # text contains no newlines at this point, so stripping is the only
        # cleanup needed
        sentence = raw.strip()
        if not sentence:
            continue
        # If the segment started with whitespace, move the offset past it so
        # that it points at the first character of the stripped sentence
        start += len(raw) - len(raw.lstrip())
        for pos, piece in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
            if len(piece) < min_sentence_length:
                # Too short to be synthesized on its own, accumulate it
                if pending_sentence:
                    pending_sentence += ' ' + piece
                    if len(pending_sentence) >= min_sentence_length:
                        yield pending_start, pending_sentence
                        pending_start, pending_sentence = 0, ''
                else:
                    pending_start, pending_sentence = pos, piece
                continue
            if pending_sentence:
                # Flush the accumulated short pieces together with this one
                piece = pending_sentence + ' ' + piece
                pos = pending_start
                pending_start, pending_sentence = 0, ''
            yield pos, piece
    if pending_sentence:
        yield pending_start, pending_sentence
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@ -256,6 +256,16 @@ class TestICU(unittest.TestCase):
            }.items():
                self.ae(expected, func(q))
    def test_split_into_sentences(self):
        # Exercise the TTS sentence splitter with a max_sentence_length of 40:
        # a lone short sentence, several short sentences that get merged, and
        # one long sentence that gets split.
        from calibre.spell.break_iterator import split_into_sentences_for_tts
        cases = (
            ('hello.', [(0, 'hello.')]),
            (
                'hello. I love you. Another small sentence. Fini.',
                [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
            ),
            (
                'a very long sentence to be split into at least two smaller sentences',
                [(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
            ),
        )
        for text, expected in cases:
            actual = list(split_into_sentences_for_tts(text, max_sentence_length=40))
            self.ae(expected, actual)
 def find_tests():
     # Build the suite of all TestICU test methods using the default loader
     loader = unittest.defaultTestLoader
     return loader.loadTestsFromTestCase(TestICU)