mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	TXT Input: Fix incorrect splitting of txt with large chunks causing some text to be dropped. Fixes #1895888 [txt to epub missing lines](https://bugs.launchpad.net/calibre/+bug/1895888)
This commit is contained in:
		
							parent
							
								
									7a4b3f61ff
								
							
						
					
					
						commit
						9ec29bb548
					
				@ -17,7 +17,7 @@ from calibre.ebooks.metadata.opf2 import OPFCreator
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
 | 
					from calibre.ebooks.conversion.preprocess import DocAnalysis
 | 
				
			||||||
from calibre.utils.cleantext import clean_ascii_chars
 | 
					from calibre.utils.cleantext import clean_ascii_chars
 | 
				
			||||||
from polyglot.builtins import iteritems, unicode_type, map, range, long_type
 | 
					from polyglot.builtins import iteritems, unicode_type, map, range
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
 | 
					HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -53,7 +53,7 @@ def clean_txt(txt):
 | 
				
			|||||||
def split_txt(txt, epub_split_size_kb=0):
 | 
					def split_txt(txt, epub_split_size_kb=0):
 | 
				
			||||||
    '''
 | 
					    '''
 | 
				
			||||||
    Ensure there are split points for converting
 | 
					    Ensure there are split points for converting
 | 
				
			||||||
    to EPUB. A misdetected paragraph type can
 | 
					    to EPUB. A mis-detected paragraph type can
 | 
				
			||||||
    result in the entire document being one giant
 | 
					    result in the entire document being one giant
 | 
				
			||||||
    paragraph. In this case the EPUB parser will not
 | 
					    paragraph. In this case the EPUB parser will not
 | 
				
			||||||
    be able to determine where to split the file
 | 
					    be able to determine where to split the file
 | 
				
			||||||
@ -64,16 +64,14 @@ def split_txt(txt, epub_split_size_kb=0):
 | 
				
			|||||||
    if epub_split_size_kb > 0:
 | 
					    if epub_split_size_kb > 0:
 | 
				
			||||||
        if isinstance(txt, unicode_type):
 | 
					        if isinstance(txt, unicode_type):
 | 
				
			||||||
            txt = txt.encode('utf-8')
 | 
					            txt = txt.encode('utf-8')
 | 
				
			||||||
        length_byte = len(txt)
 | 
					        if len(txt) > epub_split_size_kb * 1024:
 | 
				
			||||||
        # Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
 | 
					            chunk_size = max(16, epub_split_size_kb - 32) * 1024
 | 
				
			||||||
        chunk_size = long_type(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
 | 
					            # if any chunk is larger than chunk_size, break it up
 | 
				
			||||||
        # if there are chunks with a superior size then go and break
 | 
					            parts = txt.split(b'\n\n')
 | 
				
			||||||
        parts = txt.split(b'\n\n')
 | 
					            if parts and max(map(len, parts)) > chunk_size:
 | 
				
			||||||
        lengths = tuple(map(len, parts))
 | 
					                txt = b'\n\n'.join(
 | 
				
			||||||
        if lengths and max(lengths) > chunk_size:
 | 
					                    split_string_separator(line, chunk_size) for line in parts
 | 
				
			||||||
            txt = b'\n\n'.join([
 | 
					                )
 | 
				
			||||||
                split_string_separator(line, chunk_size) for line in parts
 | 
					 | 
				
			||||||
            ])
 | 
					 | 
				
			||||||
    if isbytestring(txt):
 | 
					    if isbytestring(txt):
 | 
				
			||||||
        txt = txt.decode('utf-8')
 | 
					        txt = txt.decode('utf-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -242,15 +240,15 @@ def split_string_separator(txt, size):
 | 
				
			|||||||
    '''
 | 
					    '''
 | 
				
			||||||
    if len(txt) > size and size > 2:
 | 
					    if len(txt) > size and size > 2:
 | 
				
			||||||
        size -= 2
 | 
					        size -= 2
 | 
				
			||||||
        txt = []
 | 
					        ans = []
 | 
				
			||||||
        for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
 | 
					        for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
 | 
				
			||||||
            idx = part.rfind(b'.')
 | 
					            idx = part.rfind(b'.')
 | 
				
			||||||
            if idx == -1:
 | 
					            if idx == -1:
 | 
				
			||||||
                part += b'\n\n'
 | 
					                part += b'\n\n'
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                part = part[:idx + 1] + b'\n\n' + part[idx:]
 | 
					                part = part[:idx + 1] + b'\n\n' + part[idx:]
 | 
				
			||||||
            txt.append(part)
 | 
					            ans.append(part)
 | 
				
			||||||
        txt = b''.join(txt)
 | 
					        txt = b''.join(ans)
 | 
				
			||||||
    return txt
 | 
					    return txt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user