From 88a60c1610d5ccd0c154d5aa3df08faebb9f873a Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 1 Dec 2010 18:27:33 -0500 Subject: [PATCH 1/6] TXT Output: Turn br tags into spaces. --- src/calibre/ebooks/txt/txtml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 3ecb6940f8..48c94c2543 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -35,6 +35,7 @@ BLOCK_STYLES = [ SPACE_TAGS = [ 'td', + 'br', ] class TXTMLizer(object): From 04e3ba0e812c0b2443d19a6eb6a331b94695ed56 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 1 Dec 2010 18:51:49 -0500 Subject: [PATCH 2/6] TXT Output: Basic Markdown formatted output. --- src/calibre/ebooks/txt/output.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 15db4b1974..3c0d475460 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -8,6 +8,7 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation +from calibre.ebooks.txt.markdownml import MarkdownMLizer from calibre.ebooks.txt.txtml import TXTMLizer from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines @@ -44,10 +45,17 @@ class TXTOutput(OutputFormatPlugin): recommended_value=False, level=OptionRecommendation.LOW, help=_('Force splitting on the max-line-length value when no space ' 'is present. 
Also allows max-line-length to be below the minimum')), + OptionRecommendation(name='markdown_format', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Produce Markdown formatted text.')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - writer = TXTMLizer(log) + if opts.markdown_format: + writer = MarkdownMLizer(log) + else: + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) log.debug('\tReplacing newlines with selected type...') From 98a0970f02ed7d953085377f5b5afa69563546e3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 1 Dec 2010 20:33:52 -0500 Subject: [PATCH 3/6] Add markdownml.py. TXT Output: Remove links option to make markdown output cleaner. --- src/calibre/ebooks/txt/markdownml.py | 40 ++++++++++++++++++++++++++++ src/calibre/ebooks/txt/output.py | 5 ++++ 2 files changed, 45 insertions(+) create mode 100644 src/calibre/ebooks/txt/markdownml.py diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py new file mode 100644 index 0000000000..2ea3e7dafe --- /dev/null +++ b/src/calibre/ebooks/txt/markdownml.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into Markdown formatted plain text +''' + +import re + +from lxml import etree + +from calibre.utils.html2text import html2text + +class MarkdownMLizer(object): + + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to Markdown formatted TXT...') + self.oeb_book = oeb_book + self.opts = opts + + return self.mlize_spine() + + def mlize_spine(self): + output = [u''] + for item in self.oeb_book.spine: + self.log.debug('Converting %s to Markdown formatted TXT...' 
% item.href) + html = unicode(etree.tostring(item.data, encoding=unicode)) + if self.opts.remove_links: + html = re.sub(r'<\s*a[^>]*>', '', html) + html = re.sub(r'<\s*/\s*a\s*>', '', html) + output += html2text(html) + output = u''.join(output) + + return output diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 3c0d475460..a6f52f92ca 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -48,6 +48,11 @@ class TXTOutput(OutputFormatPlugin): OptionRecommendation(name='markdown_format', recommended_value=False, level=OptionRecommendation.LOW, help=_('Produce Markdown formatted text.')), + OptionRecommendation(name='remove_links', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Remove links within the document. This is only ' \ + 'useful when paried with the markdown-format option because' \ + 'links are removed with plain text output.')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): From f078aceb6cf5f9b1a1c589303ca6f24d5adb0d71 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 1 Dec 2010 21:03:54 -0500 Subject: [PATCH 4/6] TXT Output: Add GUI support for new markdown related options. 
--- src/calibre/gui2/convert/txt_output.py | 2 +- src/calibre/gui2/convert/txt_output.ui | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8b2e131ec6..3d35151bb8 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -21,7 +21,7 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', - 'inline_toc']) + 'inline_toc', 'markdown_format', 'remove_links']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 9eae5a8115..75eac8f5fc 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -6,7 +6,7 @@ 0 0 - 400 + 470 300 @@ -27,7 +27,7 @@ - + Qt::Vertical @@ -67,6 +67,20 @@ + + + + Apply Markdown formatting to text + + + + + + + Remove links (<a> tags) before processing + + + From d744fb698b96870249a72402cc1db39d0fe81cc2 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 2 Dec 2010 06:54:36 -0500 Subject: [PATCH 5/6] TXT Output: change remove links to keep links and remove links by default with Markdown format output. TXT Output: Add keep image references option and remove images by default with Markdown format output. 
--- src/calibre/ebooks/txt/markdownml.py | 5 ++++- src/calibre/ebooks/txt/output.py | 13 +++++++++---- src/calibre/gui2/convert/txt_output.py | 2 +- src/calibre/gui2/convert/txt_output.ui | 15 +++++++++++---- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 2ea3e7dafe..2f7960e94c 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -31,9 +31,12 @@ class MarkdownMLizer(object): for item in self.oeb_book.spine: self.log.debug('Converting %s to Markdown formatted TXT...' % item.href) html = unicode(etree.tostring(item.data, encoding=unicode)) - if self.opts.remove_links: + if not self.opts.keep_links: html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*/\s*a\s*>', '', html) + if not self.opts.keep_image_references: + html = re.sub(r'<\s*img[^>]*>', '', html) + html = re.sub(r'<\s*img\s*>', '', html) output += html2text(html) output = u''.join(output) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index a6f52f92ca..a6369b6f0b 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -48,11 +48,16 @@ class TXTOutput(OutputFormatPlugin): OptionRecommendation(name='markdown_format', recommended_value=False, level=OptionRecommendation.LOW, help=_('Produce Markdown formatted text.')), - OptionRecommendation(name='remove_links', + OptionRecommendation(name='keep_links', recommended_value=False, level=OptionRecommendation.LOW, - help=_('Remove links within the document. This is only ' \ - 'useful when paried with the markdown-format option because' \ - 'links are removed with plain text output.')), + help=_('Do not remove links within the document. 
This is only ' \ + 'useful when paired with the markdown-format option because' \ + 'links are always removed with plain text output.')), + OptionRecommendation(name='keep_image_references', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove image references within the document. This is only ' \ + 'useful when paired with the markdown-format option because' \ + 'image references are always removed with plain text output.')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 3d35151bb8..2fafad4b43 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -21,7 +21,7 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', - 'inline_toc', 'markdown_format', 'remove_links']) + 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 75eac8f5fc..19e4ec52a1 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -6,7 +6,7 @@ 0 0 - 470 + 477 300 @@ -27,7 +27,7 @@ - + Qt::Vertical @@ -75,9 +75,16 @@ - + - Remove links (<a> tags) before processing + Do not remove links (<a> tags) before processing + + + + + + + Do not remove image references before processing From 8032890d0f352034b95bdf0482fbbf34b1d9f22d Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 2 Dec 2010 07:37:38 -0500 Subject: [PATCH 6/6] TXT Output: Markdown output, ensure separation between sections. 
--- src/calibre/ebooks/txt/markdownml.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 2f7960e94c..116561f355 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -28,16 +28,36 @@ class MarkdownMLizer(object): def mlize_spine(self): output = [u''] + for item in self.oeb_book.spine: self.log.debug('Converting %s to Markdown formatted TXT...' % item.href) + html = unicode(etree.tostring(item.data, encoding=unicode)) + if not self.opts.keep_links: html = re.sub(r'<\s*a[^>]*>', '', html) html = re.sub(r'<\s*/\s*a\s*>', '', html) if not self.opts.keep_image_references: html = re.sub(r'<\s*img[^>]*>', '', html) html = re.sub(r'<\s*img\s*>', '', html) - output += html2text(html) + + text = html2text(html) + + # Ensure the section ends with at least two new line characters. + # This is to prevent the last paragraph from a section being + # combined into the first paragraph of the next. + end_chars = text[-4:] + # Convert all newlines to \n + end_chars = end_chars.replace('\r\n', '\n') + end_chars = end_chars.replace('\r', '\n') + end_chars = end_chars[-2:] + if not end_chars[1] == '\n': + text += '\n\n' + if end_chars[1] == '\n' and not end_chars[0] == '\n': + text += '\n' + + output += text + output = u''.join(output) return output