diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index b2f4cc9bf0..0add461b14 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -17,7 +17,7 @@ from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.conversion.preprocess import DocAnalysis from calibre.utils.cleantext import clean_ascii_chars -from polyglot.builtins import iteritems, unicode_type, map, range, long_type +from polyglot.builtins import iteritems, unicode_type, map, range HTML_TEMPLATE = '%s \n%s\n' @@ -53,7 +53,7 @@ def clean_txt(txt): def split_txt(txt, epub_split_size_kb=0): ''' Ensure there are split points for converting - to EPUB. A misdetected paragraph type can + to EPUB. A mis-detected paragraph type can result in the entire document being one giant paragraph. In this case the EPUB parser will not be able to determine where to split the file @@ -64,16 +64,14 @@ def split_txt(txt, epub_split_size_kb=0): if epub_split_size_kb > 0: if isinstance(txt, unicode_type): txt = txt.encode('utf-8') - length_byte = len(txt) - # Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin) - chunk_size = long_type(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2)) - # if there are chunks with a superior size then go and break - parts = txt.split(b'\n\n') - lengths = tuple(map(len, parts)) - if lengths and max(lengths) > chunk_size: - txt = b'\n\n'.join([ - split_string_separator(line, chunk_size) for line in parts - ]) + if len(txt) > epub_split_size_kb * 1024: + chunk_size = max(16, epub_split_size_kb - 32) * 1024 + # if there are chunks with a superior size then go and break + parts = txt.split(b'\n\n') + if parts and max(map(len, parts)) > chunk_size: + txt = b'\n\n'.join( + split_string_separator(line, chunk_size) for line in parts + ) if isbytestring(txt): txt = txt.decode('utf-8') @@ -242,15 +240,15 @@ def split_string_separator(txt, size): ''' if len(txt) > size and size > 2: size -= 2 - txt = [] + ans = [] for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)): idx = part.rfind(b'.') if idx == -1: part += b'\n\n' else: part = part[:idx + 1] + b'\n\n' + part[idx:] - txt.append(part) - txt = b''.join(txt) + ans.append(part) + txt = b''.join(ans) return txt