mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Don't allow sentences that are too short in the piper backend
The neural net doesn't synthesize very short sentences well
This commit is contained in:
parent
1175711d54
commit
54374e9479
@ -96,13 +96,30 @@ def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: in
|
|||||||
PARAGRAPH_SEPARATOR = '\u2029'
|
PARAGRAPH_SEPARATOR = '\u2029'
|
||||||
|
|
||||||
|
|
||||||
def split_into_sentences_for_tts(text: str, lang: str = 'en', PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
|
def split_into_sentences_for_tts(
|
||||||
|
text: str, lang: str = 'en', min_sentence_length: int = 32, max_sentence_length: int = 1024, PARAGRAPH_SEPARATOR: str = PARAGRAPH_SEPARATOR):
|
||||||
import re
|
import re
|
||||||
def sub(m):
|
def sub(m):
|
||||||
return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)
|
return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)
|
||||||
text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
|
text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
|
||||||
|
pending_start, pending_sentence = 0, ''
|
||||||
for start, length in sentence_positions(text, lang):
|
for start, length in sentence_positions(text, lang):
|
||||||
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
|
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
|
||||||
if sentence:
|
if sentence:
|
||||||
for start, sentence in split_long_sentences(sentence, start, lang):
|
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
|
||||||
|
if len(sentence) < min_sentence_length:
|
||||||
|
if pending_sentence:
|
||||||
|
pending_sentence += ' ' + sentence
|
||||||
|
if len(pending_sentence) >= min_sentence_length:
|
||||||
|
yield pending_start, pending_sentence
|
||||||
|
pending_start, pending_sentence = 0, ''
|
||||||
|
else:
|
||||||
|
pending_start, pending_sentence = start, sentence
|
||||||
|
continue
|
||||||
|
if pending_sentence:
|
||||||
|
sentence = pending_sentence + ' ' + sentence
|
||||||
|
start = pending_start
|
||||||
|
pending_start, pending_sentence = 0, ''
|
||||||
yield start, sentence
|
yield start, sentence
|
||||||
|
if pending_sentence:
|
||||||
|
yield pending_start, pending_sentence
|
||||||
|
@ -256,6 +256,16 @@ class TestICU(unittest.TestCase):
|
|||||||
}.items():
|
}.items():
|
||||||
self.ae(expected, func(q))
|
self.ae(expected, func(q))
|
||||||
|
|
||||||
|
def test_split_into_sentences(self):
|
||||||
|
from calibre.spell.break_iterator import split_into_sentences_for_tts
|
||||||
|
for sentence, expected in {
|
||||||
|
'hello.': [(0, 'hello.')],
|
||||||
|
'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
|
||||||
|
'a very long sentence to be split into at least two smaller sentences': [
|
||||||
|
(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
|
||||||
|
}.items():
|
||||||
|
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
|
||||||
|
|
||||||
|
|
||||||
def find_tests():
|
def find_tests():
|
||||||
return unittest.defaultTestLoader.loadTestsFromTestCase(TestICU)
|
return unittest.defaultTestLoader.loadTestsFromTestCase(TestICU)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user