mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
TXT Input: Fix incorrect splitting of txt with large chunks causing some text to be dropped. Fixes #1895888 [txt to epub missing lines](https://bugs.launchpad.net/calibre/+bug/1895888)
This commit is contained in:
parent
7a4b3f61ff
commit
9ec29bb548
@ -17,7 +17,7 @@ from calibre.ebooks.metadata.opf2 import OPFCreator
|
|||||||
|
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from polyglot.builtins import iteritems, unicode_type, map, range, long_type
|
from polyglot.builtins import iteritems, unicode_type, map, range
|
||||||
|
|
||||||
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
|
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
|
||||||
|
|
||||||
@ -53,7 +53,7 @@ def clean_txt(txt):
|
|||||||
def split_txt(txt, epub_split_size_kb=0):
|
def split_txt(txt, epub_split_size_kb=0):
|
||||||
'''
|
'''
|
||||||
Ensure there are split points for converting
|
Ensure there are split points for converting
|
||||||
to EPUB. A misdetected paragraph type can
|
to EPUB. A mis-detected paragraph type can
|
||||||
result in the entire document being one giant
|
result in the entire document being one giant
|
||||||
paragraph. In this case the EPUB parser will not
|
paragraph. In this case the EPUB parser will not
|
||||||
be able to determine where to split the file
|
be able to determine where to split the file
|
||||||
@ -64,16 +64,14 @@ def split_txt(txt, epub_split_size_kb=0):
|
|||||||
if epub_split_size_kb > 0:
|
if epub_split_size_kb > 0:
|
||||||
if isinstance(txt, unicode_type):
|
if isinstance(txt, unicode_type):
|
||||||
txt = txt.encode('utf-8')
|
txt = txt.encode('utf-8')
|
||||||
length_byte = len(txt)
|
if len(txt) > epub_split_size_kb * 1024:
|
||||||
# Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
|
chunk_size = max(16, epub_split_size_kb - 32) * 1024
|
||||||
chunk_size = long_type(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
|
# if any blank-line-separated part is larger than chunk_size, break the parts up
|
||||||
# if any blank-line-separated part is larger than chunk_size, break the parts up
|
parts = txt.split(b'\n\n')
|
||||||
parts = txt.split(b'\n\n')
|
if parts and max(map(len, parts)) > chunk_size:
|
||||||
lengths = tuple(map(len, parts))
|
txt = b'\n\n'.join(
|
||||||
if lengths and max(lengths) > chunk_size:
|
split_string_separator(line, chunk_size) for line in parts
|
||||||
txt = b'\n\n'.join([
|
)
|
||||||
split_string_separator(line, chunk_size) for line in parts
|
|
||||||
])
|
|
||||||
if isbytestring(txt):
|
if isbytestring(txt):
|
||||||
txt = txt.decode('utf-8')
|
txt = txt.decode('utf-8')
|
||||||
|
|
||||||
@ -242,15 +240,15 @@ def split_string_separator(txt, size):
|
|||||||
'''
|
'''
|
||||||
if len(txt) > size and size > 2:
|
if len(txt) > size and size > 2:
|
||||||
size -= 2
|
size -= 2
|
||||||
txt = []
|
ans = []
|
||||||
for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
|
for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
|
||||||
idx = part.rfind(b'.')
|
idx = part.rfind(b'.')
|
||||||
if idx == -1:
|
if idx == -1:
|
||||||
part += b'\n\n'
|
part += b'\n\n'
|
||||||
else:
|
else:
|
||||||
part = part[:idx + 1] + b'\n\n' + part[idx:]
|
part = part[:idx + 1] + b'\n\n' + part[idx:]
|
||||||
txt.append(part)
|
ans.append(part)
|
||||||
txt = b''.join(txt)
|
txt = b''.join(ans)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user