diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index c53d630ed6..a12e8a0761 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -4,10 +4,9 @@ Read content from txt file. ''' -import os -import re +import os, re -from calibre import prepare_string_for_xml +from calibre import prepare_string_for_xml, isbytestring from calibre.ebooks.markdown import markdown from calibre.ebooks.metadata.opf2 import OPFCreator @@ -18,6 +17,8 @@ __docformat__ = 'restructuredtext en' HTML_TEMPLATE = u'%s\n%s\n' def convert_basic(txt, title='', epub_split_size_kb=0): + if isbytestring(txt): + txt = txt.decode('utf-8', 'replace') # Strip whitespace from the beginning and end of the line. Also replace # all line breaks with \n. txt = '\n'.join([line.strip() for line in txt.splitlines()]) @@ -30,23 +31,32 @@ def convert_basic(txt, title='', epub_split_size_kb=0): txt = re.sub('(?<=.)\s+$', '', txt) # Remove excessive line breaks. txt = re.sub('\n{3,}', '\n\n', txt) - + #remove ASCII invalid chars : 0 to 8 and 11-14 to 24 + chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) + illegal_chars = re.compile(u'|'.join(map(unichr, chars))) + txt = illegal_chars.sub('', txt) #Takes care if there is no point to split if epub_split_size_kb > 0: - length_byte = len(txt.encode('utf-8')) + if isinstance(txt, unicode): + txt = txt.encode('utf-8') + length_byte = len(txt) #Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin) chunk_size = long(length_byte / (int(length_byte / (epub_split_size_kb * 1024) ) + 2 )) #if there are chunks with a superior size then go and break - if (len(filter(lambda x: len(x.encode('utf-8')) > chunk_size, txt.split('\n\n')))) : - txt = u'\n\n'.join([split_string_separator(line, chunk_size) for line in txt.split('\n\n')]) + if (len(filter(lambda x: len(x) > chunk_size, txt.split('\n\n')))) : + txt = '\n\n'.join([split_string_separator(line, chunk_size) + for line in txt.split('\n\n')]) + if isbytestring(txt): + txt = txt.decode('utf-8') + lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): if line.strip(): - lines.append('

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + lines.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) - return HTML_TEMPLATE % (title, '\n'.join(lines)) + return HTML_TEMPLATE % (title, u'\n'.join(lines)) def convert_markdown(txt, title='', disable_toc=False): md = markdown.Markdown( @@ -58,11 +68,11 @@ def convert_markdown(txt, title='', disable_toc=False): def separate_paragraphs_single_line(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') - txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt) + txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt) return txt def separate_paragraphs_print_formatted(txt): - txt = re.sub('(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt) + txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt) return txt def preserve_spaces(txt): @@ -78,9 +88,9 @@ def opf_writer(path, opf_name, manifest, spine, mi): opf.render(opffile) def split_string_separator(txt, size) : - if len(txt.encode('utf-8')) > size: - txt = u''.join([re.sub(u'\.(?P[^.]*)$', u'.\n\n\g', + if len(txt) > size: + txt = ''.join([re.sub(u'\.(?P[^.]*)$', '.\n\n\g', txt[i:i+size], 1) for i in - xrange(0, len(txt.encode('utf-8')), size)]) + xrange(0, len(txt), size)]) return txt