Don't join short sentences if they are from a block tag or followed by multiple newlines

This commit is contained in:
Kovid Goyal 2024-09-03 16:11:48 +05:30
parent 6a2582d9ab
commit 672bdcc149
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 22 additions and 17 deletions

View File

@@ -104,22 +104,24 @@ def split_into_sentences_for_tts(
text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
pending_start, pending_sentence = 0, ''
for start, length in sentence_positions(text, lang):
end = start + length
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
if sentence:
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
if len(sentence) < min_sentence_length:
if pending_sentence:
pending_sentence += ' ' + sentence
if len(pending_sentence) >= min_sentence_length:
yield pending_start, pending_sentence
pending_start, pending_sentence = 0, ''
else:
pending_start, pending_sentence = start, sentence
continue
if pending_sentence:
sentence = pending_sentence + ' ' + sentence
start = pending_start
if not sentence:
continue
if len(sentence) < min_sentence_length and text[end-1] != PARAGRAPH_SEPARATOR:
if pending_sentence:
pending_sentence += ' ' + sentence
if len(pending_sentence) >= min_sentence_length:
yield pending_start, pending_sentence
pending_start, pending_sentence = 0, ''
yield start, sentence
else:
pending_start, pending_sentence = start, sentence
continue
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
if pending_sentence:
sentence = pending_sentence + ' ' + sentence
start = pending_start
pending_start, pending_sentence = 0, ''
yield start, sentence
if pending_sentence:
yield pending_start, pending_sentence

View File

@@ -263,6 +263,7 @@ class TestICU(unittest.TestCase):
'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
'a very long sentence to be split into at least two smaller sentences': [
(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
'hello\u2029i love you': [(0, 'hello'), (6, 'i love you')],
}.items():
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))

View File

@@ -36,9 +36,11 @@ def build_text_map(for_tts):
process_node(v'children[i]')
if for_tts and block_tags_for_tts[tag]:
# add a paragraph separator after block tags so that sentence splitting works
if node_list.length:
node_list[-1].length += 1
if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
flat_text = flat_text[:-1] + '\u2029'
elif node_list.length:
flat_text += '\u2029'
node_list[-1].length += 1
process_node(document.body)
return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}