From 57fe3de98ed39355ef76e6973cb8750b1cb99e33 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 5 Apr 2022 15:28:09 +0530
Subject: [PATCH] TXT Input: Fix rare failure to convert some large TXT files
 with non-ascii text. Fixes #1967828 [Turning on "Remove indents at beginning
 of lines" may cause error in
 converting](https://bugs.launchpad.net/calibre/+bug/1967828)

---
 src/calibre/ebooks/txt/processor.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 65a97dba4d..afeb93bc1f 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -231,14 +231,28 @@ def opf_writer(path, opf_name, manifest, spine, mi):
         opf.render(opffile)
 
 
+def split_utf8(s, n):
+    """Yield UTF-8 bytes s in chunks of at most n bytes, cut only at character boundaries (n must be >= 4)."""
+    if n < 4:  # a character can be 4 bytes long; a smaller n could yield empty chunks forever
+        raise ValueError(f'Cannot split into chunks of less than 4 bytes, got n={n}')
+    s = memoryview(s)  # zero-copy slicing while walking through the data
+    while len(s) > n:
+        k = n
+        while k > 0 and (s[k] & 0xc0) == 0x80:  # 0b10xxxxxx: continuation byte, back up to the lead byte
+            k -= 1
+        yield bytes(s[:k or n])  # k == 0 only for malformed input: cut at n so the loop still advances
+        s = s[k or n:]
+    yield bytes(s)
+
+
 def split_string_separator(txt, size):
     '''
     Splits the text by putting \n\n at the point size.
     '''
-    if len(txt) > size and size > 2:
+    if len(txt) > size and size > 3:
         size -= 2
         ans = []
-        for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
+        for part in split_utf8(txt, size):
             idx = part.rfind(b'.')
             if idx == -1:
                 part += b'\n\n'