mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Don't join short sentences if they are from a block tag or followed by multiple newlines
This commit is contained in:
parent
6a2582d9ab
commit
672bdcc149
@ -104,22 +104,24 @@ def split_into_sentences_for_tts(
|
|||||||
text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
|
text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
|
||||||
pending_start, pending_sentence = 0, ''
|
pending_start, pending_sentence = 0, ''
|
||||||
for start, length in sentence_positions(text, lang):
|
for start, length in sentence_positions(text, lang):
|
||||||
|
end = start + length
|
||||||
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
|
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
|
||||||
if sentence:
|
if not sentence:
|
||||||
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
|
continue
|
||||||
if len(sentence) < min_sentence_length:
|
if len(sentence) < min_sentence_length and text[end-1] != PARAGRAPH_SEPARATOR:
|
||||||
if pending_sentence:
|
if pending_sentence:
|
||||||
pending_sentence += ' ' + sentence
|
pending_sentence += ' ' + sentence
|
||||||
if len(pending_sentence) >= min_sentence_length:
|
if len(pending_sentence) >= min_sentence_length:
|
||||||
yield pending_start, pending_sentence
|
yield pending_start, pending_sentence
|
||||||
pending_start, pending_sentence = 0, ''
|
|
||||||
else:
|
|
||||||
pending_start, pending_sentence = start, sentence
|
|
||||||
continue
|
|
||||||
if pending_sentence:
|
|
||||||
sentence = pending_sentence + ' ' + sentence
|
|
||||||
start = pending_start
|
|
||||||
pending_start, pending_sentence = 0, ''
|
pending_start, pending_sentence = 0, ''
|
||||||
yield start, sentence
|
else:
|
||||||
|
pending_start, pending_sentence = start, sentence
|
||||||
|
continue
|
||||||
|
for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
|
||||||
|
if pending_sentence:
|
||||||
|
sentence = pending_sentence + ' ' + sentence
|
||||||
|
start = pending_start
|
||||||
|
pending_start, pending_sentence = 0, ''
|
||||||
|
yield start, sentence
|
||||||
if pending_sentence:
|
if pending_sentence:
|
||||||
yield pending_start, pending_sentence
|
yield pending_start, pending_sentence
|
||||||
|
@ -263,6 +263,7 @@ class TestICU(unittest.TestCase):
|
|||||||
'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
|
'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
|
||||||
'a very long sentence to be split into at least two smaller sentences': [
|
'a very long sentence to be split into at least two smaller sentences': [
|
||||||
(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
|
(0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
|
||||||
|
'hello\u2029i love you': [(0, 'hello'), (6, 'i love you')],
|
||||||
}.items():
|
}.items():
|
||||||
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
|
self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
|
||||||
|
|
||||||
|
@ -36,9 +36,11 @@ def build_text_map(for_tts):
|
|||||||
process_node(v'children[i]')
|
process_node(v'children[i]')
|
||||||
if for_tts and block_tags_for_tts[tag]:
|
if for_tts and block_tags_for_tts[tag]:
|
||||||
# add a paragraph separator after block tags so that sentence splitting works
|
# add a paragraph separator after block tags so that sentence splitting works
|
||||||
if node_list.length:
|
if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
|
||||||
node_list[-1].length += 1
|
flat_text = flat_text[:-1] + '\u2029'
|
||||||
|
elif node_list.length:
|
||||||
flat_text += '\u2029'
|
flat_text += '\u2029'
|
||||||
|
node_list[-1].length += 1
|
||||||
|
|
||||||
process_node(document.body)
|
process_node(document.body)
|
||||||
return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}
|
return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user