Correct a splitting problem with text entries

This commit is contained in:
Sengian 2010-06-28 23:13:13 +02:00
parent 8525faf60c
commit 8ea26f96cf
2 changed files with 21 additions and 8 deletions

View File

@ -62,6 +62,9 @@ class TXTInput(InputFormatPlugin):
except RuntimeError: except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be' raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
elif hasattr(options, 'flow_size') :
#Take care of possible split problems in epub output
html = convert_basic(txt, epub_split_size_kb = options.flow_size)
else : else :
html = convert_basic(txt) html = convert_basic(txt)

View File

@ -17,13 +17,10 @@ __docformat__ = 'restructuredtext en'
HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>' HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def convert_basic(txt, title=''): def convert_basic(txt, title='', epub_split_size_kb = 0):
lines = []
# Strip whitespace from the beginning and end of the line. Also replace # Strip whitespace from the beginning and end of the line. Also replace
# all line breaks with \n. # all line breaks with \n.
for line in txt.splitlines(): txt = '\n'.join([line.strip() for line in txt.splitlines()])
lines.append(line.strip())
txt = '\n'.join(lines)
# Condense redundant spaces # Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt) txt = re.sub('[ ]{2,}', ' ', txt)
@ -34,6 +31,15 @@ def convert_basic(txt, title=''):
# Remove excessive line breaks. # Remove excessive line breaks.
txt = re.sub('\n{3,}', '\n\n', txt) txt = re.sub('\n{3,}', '\n\n', txt)
#Takes care if there is no point to split
if ( epub_split_size_kb ) :
length_byte = len(txt.encode('utf-8'))
#Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
chunk_size = long(length_byte / (int(length_byte / (epub_split_size_kb * 1024) ) + 2 ))
#if there are chunks with a superior size then go and break
if (len(filter(lambda x: len(x.encode('utf-8')) > chunk_size, txt.split('\n\n')))) :
txt = '\n\n'.join([split_string_separator(line, chunk_size) for line in txt.split('\n\n')])
lines = [] lines = []
# Split into paragraphs based on having a blank line between text. # Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'): for line in txt.split('\n\n'):
@ -50,8 +56,7 @@ def convert_markdown(txt, title='', disable_toc=False):
return HTML_TEMPLATE % (title, md.convert(txt)) return HTML_TEMPLATE % (title, md.convert(txt))
def separate_paragraphs_single_line(txt): def separate_paragraphs_single_line(txt):
txt = txt.replace('\r\n', '\n') txt = txt.replace('(\r\n|\r)', '\n')
txt = txt.replace('\r', '\n')
txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt) txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt)
return txt return txt
@ -71,3 +76,8 @@ def opf_writer(path, opf_name, manifest, spine, mi):
with open(os.path.join(path, opf_name), 'wb') as opffile: with open(os.path.join(path, opf_name), 'wb') as opffile:
opf.render(opffile) opf.render(opffile)
def split_string_separator(txt, size):
    """Insert a paragraph break after the last period of each *size*-character
    slice of *txt*, so that downstream EPUB flow-splitting always finds a
    break point inside every chunk.

    :param txt: the (unicode) text to process
    :param size: maximum chunk size in bytes (UTF-8 encoded)
    :return: *txt* unchanged when its UTF-8 encoding already fits within
        *size* bytes; otherwise *txt* with ``'.\\n\\n'`` substituted for the
        final period of each slice.
    """
    if len(txt.encode('utf-8')) > size:
        # Iterate over CHARACTER offsets. The previous code stepped over the
        # UTF-8 byte length while slicing by character index, which misaligned
        # chunks and produced empty trailing slices for multi-byte text.
        # The count=1 with a $-anchored pattern rewrites only the last '.' of
        # each slice.
        txt = ''.join([re.sub(r'\.(?P<ends>[^.]*)$', u'.\n\n\\g<ends>',
                              txt[i:i + size], 1)
                       for i in range(0, len(txt), size)])
    return txt