diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py new file mode 100644 index 0000000000..116561f355 --- /dev/null +++ b/src/calibre/ebooks/txt/markdownml.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into Markdown formatted plain text +''' + +import re + +from lxml import etree + +from calibre.utils.html2text import html2text + +class MarkdownMLizer(object): + + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to Markdown formatted TXT...') + self.oeb_book = oeb_book + self.opts = opts + + return self.mlize_spine() + + def mlize_spine(self): + output = [u''] + + for item in self.oeb_book.spine: + self.log.debug('Converting %s to Markdown formatted TXT...' % item.href) + + html = unicode(etree.tostring(item.data, encoding=unicode)) + + if not self.opts.keep_links: + html = re.sub(r'<\s*a[^>]*>', '', html) + html = re.sub(r'<\s*/\s*a\s*>', '', html) + if not self.opts.keep_image_references: + html = re.sub(r'<\s*img[^>]*>', '', html) + html = re.sub(r'<\s*img\s*>', '', html) + + text = html2text(html) + + # Ensure the section ends with at least two new line characters. + # This is to prevent the last paragraph from a section being + # combined into the fist paragraph of the next. + end_chars = text[-4:] + # Convert all newlines to \n + end_chars = end_chars.replace('\r\n', '\n') + end_chars = end_chars.replace('\r', '\n') + end_chars = end_chars[-2:] + if not end_chars[1] == '\n': + text += '\n\n' + if end_chars[1] == '\n' and not end_chars[0] == '\n': + text += '\n' + + output += text + + output = u''.join(output) + + return output diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 15db4b1974..a6369b6f0b 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -8,6 +8,7 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation +from calibre.ebooks.txt.markdownml import MarkdownMLizer from calibre.ebooks.txt.txtml import TXTMLizer from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines @@ -44,10 +45,27 @@ class TXTOutput(OutputFormatPlugin): recommended_value=False, level=OptionRecommendation.LOW, help=_('Force splitting on the max-line-length value when no space ' 'is present. Also allows max-line-length to be below the minimum')), + OptionRecommendation(name='markdown_format', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Produce Markdown formatted text.')), + OptionRecommendation(name='keep_links', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove links within the document. This is only ' \ + 'useful when paired with the markdown-format option because' \ + 'links are always removed with plain text output.')), + OptionRecommendation(name='keep_image_references', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove image references within the document. This is only ' \ + 'useful when paired with the markdown-format option because' \ + 'image references are always removed with plain text output.')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - writer = TXTMLizer(log) + if opts.markdown_format: + writer = MarkdownMLizer(log) + else: + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) log.debug('\tReplacing newlines with selected type...') diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 3ecb6940f8..48c94c2543 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -35,6 +35,7 @@ BLOCK_STYLES = [ SPACE_TAGS = [ 'td', + 'br', ] class TXTMLizer(object): diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8b2e131ec6..2fafad4b43 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -21,7 +21,7 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', - 'inline_toc']) + 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 9eae5a8115..19e4ec52a1 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -6,7 +6,7 @@ 0 0 - 400 + 477 300 @@ -27,7 +27,7 @@ - + Qt::Vertical @@ -67,6 +67,27 @@ + + + + Apply Markdown formatting to text + + + + + + + Do not remove links (<a> tags) before processing + + + + + + + Do not remove image references before processing + + +