TXT Input: Fix rare failure to convert some large TXT files with non-ascii text. Fixes #1967828 [Turning on "Remove indents at beginning of lines" may cause error in converting](https://bugs.launchpad.net/calibre/+bug/1967828)

This commit is contained in:
Kovid Goyal 2022-04-05 15:28:09 +05:30
parent 07480ba07c
commit 57fe3de98e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -231,14 +231,28 @@ def opf_writer(path, opf_name, manifest, spine, mi):
opf.render(opffile)
def split_utf8(s, n):
    """Split UTF-8 encoded bytes s into chunks of maximum length n.

    Chunk boundaries are moved backwards so that they never fall in the
    middle of a multi-byte character, as long as a character boundary exists
    within the first n bytes of the remaining data. That is always the case
    for valid UTF-8 when n >= 4.

    If no boundary is found (malformed input, or a 4-byte character with
    n == 3) the data is cut at exactly n bytes, so that every chunk is
    non-empty and the generator always terminates.

    :param s: a bytes-like object (anything accepted by memoryview())
    :param n: maximum chunk size in bytes, must be at least 3
    :return: generator of bytes objects whose concatenation equals s
    :raises ValueError: if n is less than 3
    """
    if n < 3:
        raise ValueError(
            f'Cannot split into chunks of {n} bytes, the chunk size must be at least 3')
    # A memoryview makes the repeated re-slicing below zero-copy
    s = memoryview(s)
    while len(s) > n:
        k = n
        # Bytes of the form 0b10xxxxxx are continuation bytes. Step back until
        # s[k] starts a character, but never past the start of the buffer.
        while k > 0 and (s[k] & 0xc0) == 0x80:
            k -= 1
        if k == 0:
            # No character boundary in the window. Cutting at k == 0 would
            # yield an empty chunk and make no progress, so cut at n instead.
            k = n
        yield bytes(s[:k])
        s = s[k:]
    yield bytes(s)
def split_string_separator(txt, size):
'''
Splits the text by putting \n\n at the point size.
'''
if len(txt) > size and size > 2:
if len(txt) > size and size > 3:
size -= 2
ans = []
for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
for part in split_utf8(txt, size):
idx = part.rfind(b'.')
if idx == -1:
part += b'\n\n'