TXT Output: Markdown output, ensure separation between sections.

This commit is contained in:
John Schember 2010-12-02 07:37:38 -05:00
parent d744fb698b
commit 8032890d0f

View File

@ -28,16 +28,36 @@ class MarkdownMLizer(object):
def mlize_spine(self): def mlize_spine(self):
output = [u''] output = [u'']
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to Markdown formatted TXT...' % item.href) self.log.debug('Converting %s to Markdown formatted TXT...' % item.href)
html = unicode(etree.tostring(item.data, encoding=unicode)) html = unicode(etree.tostring(item.data, encoding=unicode))
if not self.opts.keep_links: if not self.opts.keep_links:
html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*a[^>]*>', '', html)
html = re.sub(r'<\s*/\s*a\s*>', '', html) html = re.sub(r'<\s*/\s*a\s*>', '', html)
if not self.opts.keep_image_references: if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html) html = re.sub(r'<\s*img[^>]*>', '', html)
html = re.sub(r'<\s*img\s*>', '', html) html = re.sub(r'<\s*img\s*>', '', html)
output += html2text(html)
text = html2text(html)
# Ensure the section ends with at least two new line characters.
# This is to prevent the last paragraph from a section being
# combined into the first paragraph of the next.
end_chars = text[-4:]
# Convert all newlines to \n
end_chars = end_chars.replace('\r\n', '\n')
end_chars = end_chars.replace('\r', '\n')
end_chars = end_chars[-2:]
if not end_chars[1] == '\n':
text += '\n\n'
if end_chars[1] == '\n' and not end_chars[0] == '\n':
text += '\n'
output += text
output = u''.join(output) output = u''.join(output)
return output return output