Don't join short sentences if they are from a block tag or followed by multiple newlines

2025-07-09 03:04:10 -04:00 · 2024-09-03 16:11:48 +05:30 · 2024-09-03 16:11:48 +05:30 · 672bdcc149
commit 672bdcc149
parent 6a2582d9ab
3 changed files with 22 additions and 17 deletions
--- a/src/calibre/spell/break_iterator.py
+++ b/src/calibre/spell/break_iterator.py
@@ -104,22 +104,24 @@ def split_into_sentences_for_tts(
    text = re.sub(r'\n{2,}', sub, text.replace('\r', ' ')).replace('\n', ' ')
    pending_start, pending_sentence = 0, ''
    for start, length in sentence_positions(text, lang):
+        end = start + length
        sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
-        if sentence:
-            for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
-                if len(sentence) < min_sentence_length:
-                    if pending_sentence:
-                        pending_sentence += ' ' + sentence
-                        if len(pending_sentence) >= min_sentence_length:
-                            yield pending_start, pending_sentence
-                            pending_start, pending_sentence = 0, ''
-                    else:
-                        pending_start, pending_sentence = start, sentence
-                    continue
-                if pending_sentence:
-                    sentence = pending_sentence + ' ' + sentence
-                    start = pending_start
+        if not sentence:
+            continue
+        if len(sentence) < min_sentence_length and text[end-1] != PARAGRAPH_SEPARATOR:
+            if pending_sentence:
+                pending_sentence += ' ' + sentence
+                if len(pending_sentence) >= min_sentence_length:
+                    yield pending_start, pending_sentence
                    pending_start, pending_sentence = 0, ''
-                yield start, sentence
+            else:
+                pending_start, pending_sentence = start, sentence
+            continue
+        for start, sentence in split_long_sentences(sentence, start, lang, limit=max_sentence_length):
+            if pending_sentence:
+                sentence = pending_sentence + ' ' + sentence
+                start = pending_start
+                pending_start, pending_sentence = 0, ''
+            yield start, sentence
    if pending_sentence:
        yield pending_start, pending_sentence
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -263,6 +263,7 @@ class TestICU(unittest.TestCase):
            'hello. I love you. Another small sentence. Fini.': [(0, 'hello. I love you. Another small sentence.'), (43, 'Fini.')],
            'a very long sentence to be split into at least two smaller sentences': [
                (0, 'a very long sentence to be split into at least two'), (51, 'smaller sentences')],
+            'hello\u2029i love you': [(0, 'hello'), (6, 'i love you')],
        }.items():
            self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))

--- a/src/pyj/read_book/find.pyj
+++ b/src/pyj/read_book/find.pyj
@@ -36,9 +36,11 @@ def build_text_map(for_tts):
                process_node(v'children[i]')
            if for_tts and block_tags_for_tts[tag]:
                # add a paragraph separator after block tags so that sentence splitting works
-                if node_list.length:
-                    node_list[-1].length += 1
+                if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
+                    flat_text = flat_text[:-1] + '\u2029'
+                elif node_list.length:
                    flat_text += '\u2029'
+                    node_list[-1].length += 1

    process_node(document.body)
    return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}