TXT Input: Fix rare failure to convert some large TXT files with non-ascii text. Fixes #1967828 [Turning on "Remove indents at beginning of lines" may cause error in converting](https://bugs.launchpad.net/calibre/+bug/1967828)

2025-07-09 03:04:10 -04:00 · 2022-04-05 15:28:09 +05:30 · 2022-04-05 15:28:09 +05:30 · 57fe3de98e
commit 57fe3de98e
parent 07480ba07c
1 changed files with 16 additions and 2 deletions
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -231,14 +231,28 @@ def opf_writer(path, opf_name, manifest, spine, mi):
        opf.render(opffile)


+def split_utf8(s, n):
+    """Split UTF-8 bytes s into chunks of at most n bytes, cutting well-formed input only between characters. Raises ValueError on first iteration if n < 4."""
+    if n < 4:  # a UTF-8 character can be 4 bytes long, so a smaller chunk cannot always hold one
+        raise ValueError(f'Cannot split into chunks of {n} bytes, at least 4 bytes are needed')
+    s = memoryview(s)  # slicing a memoryview does not copy the remaining data
+    while len(s) > n:
+        k = n
+        while k > n - 3 and (s[k] & 0xc0) == 0x80:  # back up over continuation bytes (10xxxxxx); capped at 3 steps so malformed input still advances
+            k -= 1
+        yield bytes(s[:k])
+        s = s[k:]
+    yield bytes(s)
+
+
 def split_string_separator(txt, size):
    '''
    Splits the text by putting \n\n at the point size.
    '''
-    if len(txt) > size and size > 2:
+    if len(txt) > size and size > 3:
        size -= 2
        ans = []
-        for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
+        for part in split_utf8(txt, size):
            idx = part.rfind(b'.')
            if idx == -1:
                part += b'\n\n'