Don't allow sentences that are too short in the piper backend

The neural net doesn't synthesize very short sentences well.
2025-08-11 09:13:57 -04:00 · 2024-09-03 13:44:41 +05:30 · 2024-09-03 13:44:41 +05:30 · 54374e9479
commit 54374e9479
parent 1175711d54
2 changed files with 29 additions and 2 deletions
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -96,13 +96,30 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
 PARAGRAPH_SEPARATOR = '\u2029'


-def split_into_sentences_for_tts(text: str, lang: str = 'en', PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
+def split_into_sentences_for_tts(
+    text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
    import re
    def sub(m):
        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)
    text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
+    pending_start, pending_sentence = 0, ''
    for start, length in sentence_positions(text, lang):
        sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
        if sentence:
-            for start, sentence in split_long_sentences(sentence, start, lang):
+            for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
+                if len(sentence) < min_sentence_length:
+                    if pending_sentence:
+                        pending_sentence += ' ' + sentence
+                        if len(pending_sentence) >= min_sentence_length:
+                            yield pending_start, pending_sentence
+                            pending_start, pending_sentence = 0, ''
+                    else:
+                        pending_start, pending_sentence = start, sentence
+                    continue
+                if pending_sentence:
+                    sentence = pending_sentence + ' ' + sentence
+                    start = pending_start
+                    pending_start, pending_sentence = 0, ''
                yield start, sentence
+    if pending_sentence:
+        yield pending_start, pending_sentence
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -256,6 +256,16 @@ class TestICU(unittest.TestCase):
            }.items():
                self.ae(expected, func(q))

+    def test_split_into_sentences(self):
+        from calibre.spell.break_iterator import split_into_sentences_for_tts
+        for sentence, expected in {
+            'hello.': [(0, 'hello.')],
+            'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
+            'a very long sentence to be split into at least two smaller sentences': [
+                (0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
+        }.items():
+            self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
+

 def find_tests():
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestICU)