mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Don't join short sentences if they are from a block tag or followed by multiple newlines
This commit is contained in:
parent
6a2582d9ab
commit
672bdcc149
@ -104,22 +104,24 @@ def split_into_sentences_for_tts(
|
||||
text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
|
||||
pending_start, pending_sentence = 0, ''
|
||||
for start, length in sentence_positions(text, lang):
|
||||
end = start + length
|
||||
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
|
||||
if sentence:
|
||||
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
|
||||
if len(sentence) < min_sentence_length:
|
||||
if pending_sentence:
|
||||
pending_sentence += ' ' + sentence
|
||||
if len(pending_sentence) >= min_sentence_length:
|
||||
yield pending_start, pending_sentence
|
||||
pending_start, pending_sentence = 0, ''
|
||||
else:
|
||||
pending_start, pending_sentence = start, sentence
|
||||
continue
|
||||
if pending_sentence:
|
||||
sentence = pending_sentence + ' ' + sentence
|
||||
start = pending_start
|
||||
if not sentence:
|
||||
continue
|
||||
if len(sentence) < min_sentence_length and text[end-1] != PARAGRAPH_SEPARATOR:
|
||||
if pending_sentence:
|
||||
pending_sentence += ' ' + sentence
|
||||
if len(pending_sentence) >= min_sentence_length:
|
||||
yield pending_start, pending_sentence
|
||||
pending_start, pending_sentence = 0, ''
|
||||
yield start, sentence
|
||||
else:
|
||||
pending_start, pending_sentence = start, sentence
|
||||
continue
|
||||
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
|
||||
if pending_sentence:
|
||||
sentence = pending_sentence + ' ' + sentence
|
||||
start = pending_start
|
||||
pending_start, pending_sentence = 0, ''
|
||||
yield start, sentence
|
||||
if pending_sentence:
|
||||
yield pending_start, pending_sentence
|
||||
|
@ -263,6 +263,7 @@ class TestICU(unittest.TestCase):
|
||||
'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
|
||||
'a very long sentence to be split into at least two smaller sentences': [
|
||||
(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
|
||||
'hello\u2029i love you': [(0, 'hello'), (6, 'i love you')],
|
||||
}.items():
|
||||
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
|
||||
|
||||
|
@ -36,9 +36,11 @@ def build_text_map(for_tts):
|
||||
process_node(v'children[i]')
|
||||
if for_tts and block_tags_for_tts[tag]:
|
||||
# add a paragraph separator after block tags so that sentence splitting works
|
||||
if node_list.length:
|
||||
node_list[-1].length += 1
|
||||
if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
|
||||
flat_text = flat_text[:-1] + '\u2029'
|
||||
elif node_list.length:
|
||||
flat_text += '\u2029'
|
||||
node_list[-1].length += 1
|
||||
|
||||
process_node(document.body)
|
||||
return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}
|
||||
|
Loading…
x
Reference in New Issue
Block a user