diff --git a/src/calibre/spell/break_iterator.py b/src/calibre/spell/break_iterator.py index 8f6cf7407c..d73c151d3b 100644 --- a/src/calibre/spell/break_iterator.py +++ b/src/calibre/spell/break_iterator.py @@ -104,22 +104,24 @@ def split_into_sentences_for_tts( text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ') pending_start, pending_sentence = 0, '' for start, length in sentence_positions(text, lang): + end = start + length sentence = text[start:start+length].rstrip().replace('\n', ' ').strip() - if sentence: - for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length): - if len(sentence) < min_sentence_length: - if pending_sentence: - pending_sentence += ' ' + sentence - if len(pending_sentence) >= min_sentence_length: - yield pending_start, pending_sentence - pending_start, pending_sentence = 0, '' - else: - pending_start, pending_sentence = start, sentence - continue - if pending_sentence: - sentence = pending_sentence + ' ' + sentence - start = pending_start + if not sentence: + continue + if len(sentence) < min_sentence_length and text[end-1] != PARAGRAPH_SEPARATOR: + if pending_sentence: + pending_sentence += ' ' + sentence + if len(pending_sentence) >= min_sentence_length: + yield pending_start, pending_sentence pending_start, pending_sentence = 0, '' - yield start, sentence + else: + pending_start, pending_sentence = start, sentence + continue + for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length): + if pending_sentence: + sentence = pending_sentence + ' ' + sentence + start = pending_start + pending_start, pending_sentence = 0, '' + yield start, sentence if pending_sentence: yield pending_start, pending_sentence diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index 36d1cde597..e482dd4ee0 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -263,6 +263,7 @@ class TestICU(unittest.TestCase): 'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')], 'a very long sentence to be split into at least two smaller sentences': [ (0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')], + 'hello\u2029i love you': [(0, 'hello'), (6, 'i love you')], }.items(): self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40))) diff --git a/src/pyj/read_book/find.pyj b/src/pyj/read_book/find.pyj index 5bfdcaf636..2861079e72 100644 --- a/src/pyj/read_book/find.pyj +++ b/src/pyj/read_book/find.pyj @@ -36,9 +36,11 @@ def build_text_map(for_tts): process_node(v'children[i]') if for_tts and block_tags_for_tts[tag]: # add a paragraph separator after block tags so that sentence splitting works - if node_list.length: - node_list[-1].length += 1 + if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1: + flat_text = flat_text[:-1] + '\u2029' + elif node_list.length: flat_text += '\u2029' + node_list[-1].length += 1 process_node(document.body) return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}