mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Fix rare failure to convert some large TXT files with non-ascii text. Fixes #1967828 [Turning on "Remove indents at beginning of lines" may cause error in converting](https://bugs.launchpad.net/calibre/+bug/1967828)
This commit is contained in:
parent
07480ba07c
commit
57fe3de98e
@ -231,14 +231,28 @@ def opf_writer(path, opf_name, manifest, spine, mi):
|
|||||||
opf.render(opffile)
|
opf.render(opffile)
|
||||||
|
|
||||||
|
|
||||||
|
def split_utf8(s, n):
    """Split UTF-8 encoded bytes into chunks of at most n bytes.

    Chunks are cut only on character boundaries, so every chunk of
    well-formed input is itself valid UTF-8. Joining the yielded chunks
    always reproduces the input exactly.

    :param s: a bytes-like object (anything ``memoryview`` accepts)
              holding UTF-8 encoded text.
    :param n: maximum chunk length in bytes; must be at least 4, the
              length of the longest UTF-8 encoded character.
    :return: a generator yielding ``bytes`` objects. At least one chunk
             is always yielded (``b''`` for empty input).
    :raises ValueError: if n < 4. As this is a generator, the error is
                        raised when iteration starts.
    """
    # A UTF-8 character occupies up to 4 bytes. With a smaller limit a
    # single character may not fit into a chunk at all, and the backward
    # scan below could never make progress.
    if n < 4:
        raise ValueError(
            f'Cannot split into chunks of {n} bytes, at least 4 bytes are needed')
    # A memoryview lets us re-slice the remainder without copying it.
    s = memoryview(s)
    while len(s) > n:
        k = n
        # s[k] would be the first byte of the next chunk. Continuation
        # bytes look like 0b10xxxxxx, so back up until the next chunk
        # starts on a lead byte.
        while k > 0 and (s[k] & 0xc0) == 0x80:
            k -= 1
        if k == 0:
            # Only reachable for malformed input (a run of more than n
            # continuation bytes). Cut at n anyway so that we always
            # advance instead of yielding empty chunks forever.
            k = n
        yield bytes(s[:k])
        s = s[k:]
    yield bytes(s)
|
||||||
|
|
||||||
|
|
||||||
def split_string_separator(txt, size):
|
def split_string_separator(txt, size):
|
||||||
'''
|
'''
|
||||||
Splits the text by putting \n\n at the point size.
|
Splits the text by putting \n\n at the point size.
|
||||||
'''
|
'''
|
||||||
if len(txt) > size and size > 2:
|
if len(txt) > size and size > 3:
|
||||||
size -= 2
|
size -= 2
|
||||||
ans = []
|
ans = []
|
||||||
for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
|
for part in split_utf8(txt, size):
|
||||||
idx = part.rfind(b'.')
|
idx = part.rfind(b'.')
|
||||||
if idx == -1:
|
if idx == -1:
|
||||||
part += b'\n\n'
|
part += b'\n\n'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user