TXT Output: Preserve as much formatting as possible when generating Markdown output

This commit is contained in:
Kovid Goyal 2011-09-03 14:47:17 -06:00
commit e7fb1497f7

View File

@ -1,61 +1,231 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '''2011, John Schember <john@nachtimwald.com>
2011, Leigh Parry <leighparry@blueyonder.co.uk>'''
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
''' '''
Transform OEB content into Markdown formatted plain text Transform OEB content into Textile formatted plain text
''' '''
import re import re
from lxml import etree from functools import partial
from calibre.utils.html2text import html2text from calibre.ebooks.htmlz.oeb2html import OEB2HTML
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ebooks.txt.unsmarten import unsmarten
class MarkdownMLizer(object): class MarkdownMLizer(OEB2HTML):
def __init__(self, log):
self.log = log
def extract_content(self, oeb_book, opts): def extract_content(self, oeb_book, opts):
self.log.info('Converting XHTML to Markdown formatted TXT...') self.log.info('Converting XHTML to Markdown formatted TXT...')
self.oeb_book = oeb_book
self.opts = opts self.opts = opts
self.in_pre = False
self.list = []
self.blockquotes = 0
self.remove_space_after_newline = False
self.base_hrefs = [item.href for item in oeb_book.spine]
self.map_resources(oeb_book)
return self.mlize_spine() self.style_bold = False
self.style_italic = False
def mlize_spine(self): txt = self.mlize_spine(oeb_book)
if self.opts.unsmarten_punctuation:
txt = unsmarten(txt)
# Do some tidying up
txt = self.tidy_up(txt)
return txt
def mlize_spine(self, oeb_book):
output = [u''] output = [u'']
for item in oeb_book.spine:
for item in self.oeb_book.spine:
self.log.debug('Converting %s to Markdown formatted TXT...' % item.href) self.log.debug('Converting %s to Markdown formatted TXT...' % item.href)
self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output.append('\n\n')
return ''.join(output)
html = unicode(etree.tostring(item.data, encoding=unicode)) def tidy_up(self, text):
# Remove blank space form beginning of paragraph.
text = re.sub('(?msu)^[ ]{1,3}', '', text)
# Remove spaces from blank lines.
text = re.sub('(?msu)^[ ]+$', '', text)
# Reduce blank lines
text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
# Remove blank lines at beginning and end of document.
text = re.sub('^\s*', '', text)
text = re.sub('\s*$', '\n\n', text)
if not self.opts.keep_links: return text
html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
if not self.opts.keep_image_references:
html = re.sub(r'<\s*img[^>]*>', '', html)
text = html2text(html) def remove_newlines(self, text):
text = text.replace('\r\n', ' ')
text = text.replace('\n', ' ')
text = text.replace('\r', ' ')
# Condense redundant spaces created by replacing newlines with spaces.
text = re.sub(r'[ ]{2,}', ' ', text)
text = re.sub(r'\t+', '', text)
if self.remove_space_after_newline == True:
text = re.sub(r'^ +', '', text)
self.remove_space_after_newline = False
return text
# Ensure the section ends with at least two new line characters. def prepare_string_for_markdown(self, txt):
# This is to prevent the last paragraph from a section being txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt)
# combined into the fist paragraph of the next. return txt
end_chars = text[-4:]
# Convert all newlines to \n
end_chars = end_chars.replace('\r\n', '\n')
end_chars = end_chars.replace('\r', '\n')
end_chars = end_chars[-2:]
if not end_chars[1] == '\n':
text += '\n\n'
if end_chars[1] == '\n' and not end_chars[0] == '\n':
text += '\n'
output += text def dump_text(self, elem, stylizer):
'''
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
'''
output = u''.join(output) # We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS:
p = elem.getparent()
if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
and elem.tail:
return [elem.tail]
return ['']
return output # Setup our variables.
text = ['']
style = stylizer.style(elem)
tags = []
tag = barename(elem.tag)
attribs = elem.attrib
# Ignore anything that is set to not be displayed.
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
return ['']
# Soft scene breaks.
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
if ems >= 1:
text.append(u'\n\n' * ems)
bq = '> ' * self.blockquotes
# Block level elements
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
h_tag = ''
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
h_tag = '#' * int(tag[1]) + ' '
text.append('\n' + bq + h_tag)
tags.append('\n')
self.remove_space_after_newline = True
if style['font-style'] == 'italic' or tag in ('i', 'em'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
if self.style_italic == False:
text.append('*')
tags.append('*')
self.style_italic = True
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
if self.style_bold == False:
text.append('**')
tags.append('**')
self.style_bold = True
if tag == 'br':
text.append(' \n')
self.remove_space_after_newline = True
if tag == 'blockquote':
self.blockquotes += 1
tags.append('>')
text.append('> ' * self.blockquotes)
elif tag in ('code', 'pre'):
self.in_pre = True
text.append(' ')
elif tag == 'hr':
text.append('\n* * *')
tags.append('\n')
elif tag == 'a':
# Only write links with absolute (external) urls.
if attribs.has_key('href') and '://' in attribs['href']:
title = ''
if attribs.has_key('title'):
title = ' "' + attribs['title'] + '" '
text.append('[')
tags.append('](' + attribs['href'] + title + ')')
elif tag == 'img':
if self.opts.keep_image_references:
txt = '!'
if attribs.has_key('alt'):
txt += '[' + attribs['alt'] + ']'
txt += '(' + attribs['src'] + ')'
text.append(txt)
elif tag in ('ol', 'ul'):
self.list.append({'name': tag, 'num': 0})
elif tag == 'li':
if self.list:
li = self.list[-1]
else:
li = {'name': 'ul', 'num': 0}
text.append('\n')
text.append(bq)
if li['name'] == 'ul':
text.append('+ ')
elif li['name'] == 'ol':
text.append(unicode(len(self.list)) + '. ')
tags.append('')
# Process tags that contain text.
if hasattr(elem, 'text') and elem.text:
txt = elem.text
if not self.in_pre:
txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
text.append(txt)
# Recurse down into tags within the tag we are in.
for item in elem:
text += self.dump_text(item, stylizer)
# Close all open tags.
tags.reverse()
for t in tags:
if t in ('pre', 'ul', 'ol', 'li', '>', 'block'):
if t == 'pre':
self.in_pre = False
elif t == '>':
self.blockquotes -= 1
elif t == 'block':
if self.style_bold:
text.append('**')
if self.style_italic:
text.append('*')
elif t in ('ul', 'ol'):
if self.list:
self.list.pop()
if not self.list:
text.append('\n')
else:
if t == '**':
self.style_bold = False
elif t == '*':
self.style_italic = False
text.append('%s' % t)
# Soft scene breaks.
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
if ems >= 1:
text.append(u'\n\n' * ems)
# Add the text that is outside of the tag.
if hasattr(elem, 'tail') and elem.tail:
tail = elem.tail
if not self.in_pre:
tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
text.append(tail)
return text