mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Remove invalid ASCII characters from plain text files
This commit is contained in:
parent
a80c304fa4
commit
3405615e54
@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = preserve_spaces(txt)
|
txt = preserve_spaces(txt)
|
||||||
|
|
||||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||||
|
txt = txt.encode('utf-8')
|
||||||
|
|
||||||
if options.markdown:
|
if options.markdown:
|
||||||
log.debug('Running text though markdown conversion...')
|
log.debug('Running text though markdown conversion...')
|
||||||
@ -79,7 +80,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
base = os.path.dirname(stream.name)
|
base = os.path.dirname(stream.name)
|
||||||
htmlfile = open(os.path.join(base, 'temp_calibre_txt_input_to_html.html'),
|
htmlfile = open(os.path.join(base, 'temp_calibre_txt_input_to_html.html'),
|
||||||
'wb')
|
'wb')
|
||||||
htmlfile.write(html.encode('utf-8'))
|
htmlfile.write(html) #html.encode('utf-8')
|
||||||
htmlfile.close()
|
htmlfile.close()
|
||||||
cwd = os.getcwdu()
|
cwd = os.getcwdu()
|
||||||
odi = options.debug_pipeline
|
odi = options.debug_pipeline
|
||||||
|
@ -19,7 +19,7 @@ HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html
|
|||||||
def convert_basic(txt, title='', epub_split_size_kb=0):
|
def convert_basic(txt, title='', epub_split_size_kb=0):
|
||||||
# Strip whitespace from the beginning and end of the line. Also replace
|
# Strip whitespace from the beginning and end of the line. Also replace
|
||||||
# all line breaks with \n.
|
# all line breaks with \n.
|
||||||
txt = '\n'.join([line.strip() for line in txt.splitlines()])
|
txt = u'\n'.join([line.strip() for line in txt.splitlines()])
|
||||||
|
|
||||||
# Condense redundant spaces
|
# Condense redundant spaces
|
||||||
txt = re.sub('[ ]{2,}', ' ', txt)
|
txt = re.sub('[ ]{2,}', ' ', txt)
|
||||||
@ -29,23 +29,28 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
txt = re.sub('(?<=.)\s+$', '', txt)
|
txt = re.sub('(?<=.)\s+$', '', txt)
|
||||||
# Remove excessive line breaks.
|
# Remove excessive line breaks.
|
||||||
txt = re.sub('\n{3,}', '\n\n', txt)
|
txt = re.sub('\n{3,}', '\n\n', txt)
|
||||||
|
#remove invalid ASCII control chars: 0 to 8, 11, and 14 to 24 (decimal)
|
||||||
|
#illegal_char = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08| \
|
||||||
|
# \x0B|\x0E|\x0F|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18')
|
||||||
|
txt = re.sub('\u000[0-8]|\u001[14-9]|\u002[0-4]', '', txt)
|
||||||
|
|
||||||
#Takes care if there is no point to split
|
#Takes care if there is no point to split
|
||||||
if epub_split_size_kb > 0:
|
if epub_split_size_kb > 0:
|
||||||
length_byte = len(txt.encode('utf-8'))
|
length_byte = len(txt)
|
||||||
#Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
|
#Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
|
||||||
chunk_size = long(length_byte / (int(length_byte / (epub_split_size_kb * 1024) ) + 2 ))
|
chunk_size = long(length_byte / (int(length_byte / (epub_split_size_kb * 1024) ) + 2 ))
|
||||||
#if any chunk is larger than chunk_size, split it
|
#if any chunk is larger than chunk_size, split it
|
||||||
if (len(filter(lambda x: len(x.encode('utf-8')) > chunk_size, txt.split('\n\n')))) :
|
if (len(filter(lambda x: len(x) > chunk_size, txt.split('\n\n')))) :
|
||||||
txt = u'\n\n'.join([split_string_separator(line, chunk_size) for line in txt.split('\n\n')])
|
txt = u'\n\n'.join([split_string_separator(line, chunk_size)
|
||||||
|
for line in txt.split('\n\n')])
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
# Split into paragraphs based on having a blank line between text.
|
# Split into paragraphs based on having a blank line between text.
|
||||||
for line in txt.split('\n\n'):
|
for line in txt.split('\n\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
lines.append('<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
|
lines.append(u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
|
||||||
|
|
||||||
return HTML_TEMPLATE % (title, '\n'.join(lines))
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
|
def convert_markdown(txt, title='', disable_toc=False):
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
@ -57,11 +62,11 @@ def convert_markdown(txt, title='', disable_toc=False):
|
|||||||
def separate_paragraphs_single_line(txt):
|
def separate_paragraphs_single_line(txt):
|
||||||
txt = txt.replace('\r\n', '\n')
|
txt = txt.replace('\r\n', '\n')
|
||||||
txt = txt.replace('\r', '\n')
|
txt = txt.replace('\r', '\n')
|
||||||
txt = re.sub(u'(?<=.)\n(?=.)', u'\n\n', txt)
|
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def separate_paragraphs_print_formatted(txt):
|
def separate_paragraphs_print_formatted(txt):
|
||||||
txt = re.sub('(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
|
txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def preserve_spaces(txt):
|
def preserve_spaces(txt):
|
||||||
@ -77,9 +82,9 @@ def opf_writer(path, opf_name, manifest, spine, mi):
|
|||||||
opf.render(opffile)
|
opf.render(opffile)
|
||||||
|
|
||||||
def split_string_separator(txt, size) :
|
def split_string_separator(txt, size) :
|
||||||
if len(txt.encode('utf-8')) > size:
|
if len(txt) > size:
|
||||||
txt = u''.join([re.sub(u'\.(?P<ends>[^.]*)$', u'.\n\n\g<ends>',
|
txt = u''.join([re.sub(u'\.(?P<ends>[^.]*)$', u'.\n\n\g<ends>',
|
||||||
txt[i:i+size], 1) for i in
|
txt[i:i+size], 1) for i in
|
||||||
xrange(0, len(txt.encode('utf-8')), size)])
|
xrange(0, len(txt), size)])
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user