TXT Input: Fix incorrect splitting of TXT input with large chunks, which caused some text to be dropped. Fixes #1895888 [txt to epub missing lines](https://bugs.launchpad.net/calibre/+bug/1895888)

Author: Kovid Goyal
Date:   2020-09-17 14:05:40 +05:30
Parent: 7a4b3f61ff
Commit: 9ec29bb548
Signature: GPG Key ID 06BC317B515ACE7C (no known key found for this signature in database)


```diff
@@ -17,7 +17,7 @@ from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 from calibre.utils.cleantext import clean_ascii_chars
-from polyglot.builtins import iteritems, unicode_type, map, range, long_type
+from polyglot.builtins import iteritems, unicode_type, map, range
 
 HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
```
```diff
@@ -53,7 +53,7 @@ def clean_txt(txt):
 def split_txt(txt, epub_split_size_kb=0):
     '''
     Ensure there are split points for converting
-    to EPUB. A misdetected paragraph type can
+    to EPUB. A mis-detected paragraph type can
     result in the entire document being one giant
     paragraph. In this case the EPUB parser will not
     be able to determine where to split the file
```
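The failure mode the docstring describes is easy to see in isolation: if paragraph detection misfires and the text has no blank-line separators, splitting on `b'\n\n'` yields a single multi-megabyte "paragraph" with no split points. A standalone sketch (illustrative values, not calibre code):

```python
# A text whose lines are separated by single newlines only: paragraph
# detection that expects blank-line separators finds nothing to split on.
txt = b'line one\nline two\nline three\n' * 100000  # ~2.9 MB

parts = txt.split(b'\n\n')
print(len(parts))     # 1 -> the whole document is one giant "paragraph"
print(len(parts[0]))  # 2900000 bytes, far above any per-file EPUB limit
```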
```diff
@@ -64,16 +64,14 @@ def split_txt(txt, epub_split_size_kb=0):
     if epub_split_size_kb > 0:
         if isinstance(txt, unicode_type):
             txt = txt.encode('utf-8')
-        length_byte = len(txt)
-        # Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
-        chunk_size = long_type(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
-        # if there are chunks with a superior size then go and break
-        parts = txt.split(b'\n\n')
-        lengths = tuple(map(len, parts))
-        if lengths and max(lengths) > chunk_size:
-            txt = b'\n\n'.join([
-                split_string_separator(line, chunk_size) for line in parts
-            ])
+        if len(txt) > epub_split_size_kb * 1024:
+            chunk_size = max(16, epub_split_size_kb - 32) * 1024
+            # if there are chunks with a superior size then go and break
+            parts = txt.split(b'\n\n')
+            if parts and max(map(len, parts)) > chunk_size:
+                txt = b'\n\n'.join(
+                    split_string_separator(line, chunk_size) for line in parts
+                )
 
     if isbytestring(txt):
         txt = txt.decode('utf-8')
```
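The behavioural change in this hunk: the old code always computed an "average" chunk size sitting just under the limit (the reason `long_type` was imported), while the new code only splits when the text actually exceeds the limit, and targets a fixed size 32 KB below it, floored at 16 KB. A worked comparison with illustrative numbers (the 260 KB limit and 10 MB input are assumptions, not values from the commit):

```python
epub_split_size_kb = 260            # assumed limit, for illustration only
length_byte = 10 * 1024 * 1024      # a hypothetical 10 MB text

# Old formula: average chunk size, with 2 extra chunks as a safety margin.
old_chunk = length_byte // (length_byte // (epub_split_size_kb * 1024) + 2)

# New formula: fixed target 32 KB below the limit, never smaller than 16 KB.
new_chunk = max(16, epub_split_size_kb - 32) * 1024

print(old_chunk)  # 255750 (~250 KB): only a few KB of headroom under 260 KB
print(new_chunk)  # 233472 (228 KB): a full 32 KB of headroom per chunk
```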
```diff
@@ -242,15 +240,15 @@ def split_string_separator(txt, size):
     '''
     if len(txt) > size and size > 2:
         size -= 2
-        txt = []
+        ans = []
         for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
             idx = part.rfind(b'.')
             if idx == -1:
                 part += b'\n\n'
             else:
                 part = part[:idx + 1] + b'\n\n' + part[idx:]
-            txt.append(part)
-        txt = b''.join(txt)
+            ans.append(part)
+        txt = b''.join(ans)
     return txt
```
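This hunk contains the actual dropped-text bug: the old code rebound `txt` to an empty list, so when the generator expression was created, `range(0, len(txt), size)` saw the empty list and produced nothing, and any oversized paragraph came back as `b''`. A minimal sketch of the old behaviour (simplified, not the full calibre function):

```python
def old_split(txt, size):
    # Simplified copy of the pre-fix logic from this hunk.
    if len(txt) > size and size > 2:
        size -= 2
        txt = []  # BUG: shadows the input bytes
        # range(0, len(txt), size) is evaluated here, with len(txt) == 0,
        # so the generator is empty and the loop body never runs.
        for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
            txt.append(part)
        txt = b''.join(txt)  # b''.join([]) == b''
    return txt

print(old_split(b'x' * 100, 10))  # b'' -- the entire paragraph is dropped
```

Renaming the accumulator to `ans`, as the commit does, lets the generator slice the original bytes, so every chunk is preserved.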