From 57fe3de98ed39355ef76e6973cb8750b1cb99e33 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 5 Apr 2022 15:28:09 +0530
Subject: [PATCH] TXT Input: Fix rare failure to convert some large TXT files with non-ascii text. Fixes #1967828 [Turning on "Remove indents at beginning of lines" may cause error in converting](https://bugs.launchpad.net/calibre/+bug/1967828)

---
 src/calibre/ebooks/txt/processor.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 65a97dba4d..afeb93bc1f 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -231,14 +231,37 @@ def opf_writer(path, opf_name, manifest, spine, mi):
         opf.render(opffile)
 
 
+def split_utf8(s, n):
+    """Split UTF-8 encoded bytes s into chunks of at most n bytes.
+
+    Chunks never end in the middle of a well-formed multi-byte character,
+    so n must be at least 4, the length of the longest UTF-8 sequence.
+    """
+    if n < 4:
+        raise ValueError(f'Cannot split into chunks of less than 4 bytes, got {n}')
+    s = memoryview(s)
+    while len(s) > n:
+        k = n
+        # Back up to the nearest character boundary (a non-continuation byte)
+        while k > 0 and (s[k] & 0xc0) == 0x80:
+            k -= 1
+        if k == 0:
+            # Malformed input (a long run of continuation bytes): cut at n
+            # anyway so that every iteration makes progress
+            k = n
+        yield bytes(s[:k])
+        s = s[k:]
+    yield bytes(s)
+
+
 def split_string_separator(txt, size):
     '''
     Splits the text by putting \n\n at the point size.
     '''
-    if len(txt) > size and size > 2:
+    if len(txt) > size and size > 5:  # split_utf8 needs size - 2 >= 4
         size -= 2
         ans = []
-        for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
+        for part in split_utf8(txt, size):
             idx = part.rfind(b'.')
             if idx == -1:
                 part += b'\n\n'