mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Fix rare failure to convert some large TXT files with non-ascii text. Fixes #1967828 [Turning on "Remove indents at beginning of lines" may cause error in converting](https://bugs.launchpad.net/calibre/+bug/1967828)
This commit is contained in:
parent
07480ba07c
commit
57fe3de98e
@ -231,14 +231,28 @@ def opf_writer(path, opf_name, manifest, spine, mi):
|
||||
opf.render(opffile)
|
||||
|
||||
|
||||
def split_utf8(s, n):
    """Split UTF-8 encoded bytes s into chunks of at most n bytes.

    Each cut is moved backwards from offset n to the nearest byte that is
    not a UTF-8 continuation byte (bit pattern 10xxxxxx), so a multi-byte
    character is not split across two chunks.

    This is a generator: chunks are yielded as ``bytes`` objects and their
    concatenation equals the input. An empty input yields one empty chunk.

    If no character boundary exists in the first n bytes (only possible
    when n == 3 and the data starts with a 4-byte character, or when the
    data is not valid UTF-8), the chunk is cut at exactly n bytes so that
    the generator always makes progress. Use n >= 4 to guarantee that valid
    UTF-8 is never cut inside a character.

    :param s: a bytes-like object holding UTF-8 encoded text
    :param n: maximum chunk size in bytes, must be at least 3
    :raises ValueError: if n < 3 (raised when iteration starts)
    """
    if n < 3:
        raise ValueError(
            f'Cannot split into chunks of {n} bytes, the chunk size must be at least 3')
    # A memoryview makes the repeated re-slicing below zero-copy.
    s = memoryview(s)
    while len(s) > n:
        k = n
        # Walk back over continuation bytes to the start of a character.
        # Stop at 0 so we never index from the end of the buffer.
        while k > 0 and (s[k] & 0xc0) == 0x80:
            k -= 1
        if k == 0:
            # No boundary inside the window: emitting an empty chunk would
            # loop forever, so fall back to a hard cut at n bytes.
            k = n
        yield bytes(s[:k])
        s = s[k:]
    yield bytes(s)
|
||||
|
||||
|
||||
def split_string_separator(txt, size):
|
||||
'''
|
||||
Splits the text by putting \n\n at the point size.
|
||||
'''
|
||||
if len(txt) > size and size > 2:
|
||||
if len(txt) > size and size > 3:
|
||||
size -= 2
|
||||
ans = []
|
||||
for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
|
||||
for part in split_utf8(txt, size):
|
||||
idx = part.rfind(b'.')
|
||||
if idx == -1:
|
||||
part += b'\n\n'
|
||||
|
Loading…
x
Reference in New Issue
Block a user