TXT Input: Fix incorrect splitting of txt with large chunks causing some text to be dropped. Fixes #1895888 [txt to epub missing lines](https://bugs.launchpad.net/calibre/+bug/1895888)

commit 9ec29bb548
parent 7a4b3f61ff
Author: Kovid Goyal
Date:   2020-09-17 14:05:40 +05:30
GPG Key ID: 06BC317B515ACE7C


@@ -17,7 +17,7 @@ from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 from calibre.utils.cleantext import clean_ascii_chars
-from polyglot.builtins import iteritems, unicode_type, map, range, long_type
+from polyglot.builtins import iteritems, unicode_type, map, range
 
 HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
@@ -53,7 +53,7 @@ def clean_txt(txt):
 def split_txt(txt, epub_split_size_kb=0):
     '''
     Ensure there are split points for converting
-    to EPUB. A misdetected paragraph type can
+    to EPUB. A mis-detected paragraph type can
     result in the entire document being one giant
     paragraph. In this case the EPUB parser will not
     be able to determine where to split the file
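
For context, the "split points" the docstring refers to are blank lines (b'\n\n'), which become paragraph boundaries in the generated HTML and hence places where the EPUB output can break the file. A minimal illustration of the problem and the remedy, using a hypothetical 1 KB size limit rather than calibre's real defaults:

    # One giant paragraph offers no b'\n\n' split points, so a size
    # limit cannot be honoured no matter where the converter looks:
    one_paragraph = b'word ' * 1000          # ~5000 bytes, no blank lines
    assert b'\n\n' not in one_paragraph

    # After split points are inserted every ~1024 bytes, no single
    # unsplittable run exceeds the limit any more:
    chunks = [one_paragraph[i:i + 1024] for i in range(0, len(one_paragraph), 1024)]
    with_splits = b'\n\n'.join(chunks)
    assert max(map(len, with_splits.split(b'\n\n'))) <= 1024
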
@@ -64,16 +64,14 @@ def split_txt(txt, epub_split_size_kb=0):
     if epub_split_size_kb > 0:
         if isinstance(txt, unicode_type):
             txt = txt.encode('utf-8')
-        length_byte = len(txt)
-        # Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
-        chunk_size = long_type(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
-        # if there are chunks with a superior size then go and break
-        parts = txt.split(b'\n\n')
-        lengths = tuple(map(len, parts))
-        if lengths and max(lengths) > chunk_size:
-            txt = b'\n\n'.join([
-                split_string_separator(line, chunk_size) for line in parts
-            ])
+        if len(txt) > epub_split_size_kb * 1024:
+            chunk_size = max(16, epub_split_size_kb - 32) * 1024
+            # if there are chunks with a superior size then go and break
+            parts = txt.split(b'\n\n')
+            if parts and max(map(len, parts)) > chunk_size:
+                txt = b'\n\n'.join(
+                    split_string_separator(line, chunk_size) for line in parts
+                )
 
     if isbytestring(txt):
         txt = txt.decode('utf-8')
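
The replaced formula derived an "average" chunk size from the total document length, whereas the new one is a fixed 32 KB safety margin under the configured limit, floored at 16 KB, and is only computed when the document exceeds that limit at all. A rough comparison of the two formulas, using an illustrative 1 MiB document and a 256 KB limit (both values chosen just for the example):

    length_byte = 1024 * 1024      # hypothetical 1 MiB document
    epub_split_size_kb = 256       # hypothetical size limit

    # Old: average chunk derived from the document length (+2 as margin);
    # the result shrinks and grows with the input.
    old_chunk = int(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
    print(old_chunk)               # 174762

    # New: fixed margin below the limit, independent of document length.
    new_chunk = max(16, epub_split_size_kb - 32) * 1024
    print(new_chunk)               # 229376
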
@@ -242,15 +240,15 @@ def split_string_separator(txt, size):
     '''
     if len(txt) > size and size > 2:
         size -= 2
-        txt = []
+        ans = []
         for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
             idx = part.rfind(b'.')
             if idx == -1:
                 part += b'\n\n'
             else:
                 part = part[:idx + 1] + b'\n\n' + part[idx:]
-            txt.append(part)
-        txt = b''.join(txt)
+            ans.append(part)
+        txt = b''.join(ans)
     return txt
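
The dropped text fixed here came from the accumulator shadowing its own input. In the old body, txt was rebound to an empty list before the generator expression's range(0, len(txt), size) was evaluated; Python evaluates that outermost iterable at generator-creation time, so the range saw len([]) == 0, the loop never ran, and b''.join([]) returned an empty byte string in place of the oversized paragraph. A stripped-down reproduction (the b'.'-seeking logic is omitted for brevity):

    def split_old(txt, size):
        # Simplified copy of the pre-fix function body.
        if len(txt) > size and size > 2:
            size -= 2
            txt = []  # rebinding the input: len(txt) is now 0 ...
            # ... so range(0, len(txt), size) below is empty and the
            # loop body never executes.
            for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
                txt.append(part)
            txt = b''.join(txt)  # b''.join([]) == b'': text silently dropped
        return txt

    print(split_old(b'x' * 100, 10))  # prints b'' instead of the split text

Renaming the accumulator to ans leaves txt bound to the original bytes throughout, so both the range and the slices see the full input.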