diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 29b3d899bc..b73a6e8908 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -8,7 +8,6 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation -from calibre.ebooks.txt.markdownml import MarkdownMLizer from calibre.ebooks.txt.txtml import TXTMLizer from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines @@ -44,24 +43,32 @@ class TXTOutput(OutputFormatPlugin): recommended_value=False, level=OptionRecommendation.LOW, help=_('Force splitting on the max-line-length value when no space ' 'is present. Also allows max-line-length to be below the minimum')), - OptionRecommendation(name='markdown_format', - recommended_value=False, level=OptionRecommendation.LOW, - help=_('Produce Markdown formatted text.')), + OptionRecommendation(name='txt_output_formatting', + recommended_value='plain', + choices=['plain', 'markdown', 'textile'], + help=_('Formatting used within the document.\n' + '* plain: Produce plain text.\n' + '* markdown: Produce Markdown formatted text.\n' + '* textile: Produce Textile formatted text.')), OptionRecommendation(name='keep_links', recommended_value=False, level=OptionRecommendation.LOW, help=_('Do not remove links within the document. This is only ' \ - 'useful when paired with the markdown-format option because' \ - ' links are always removed with plain text output.')), + 'useful when paired with a txt-output-formatting option that ' + 'is not none because links are always removed with plain text output.')), OptionRecommendation(name='keep_image_references', recommended_value=False, level=OptionRecommendation.LOW, help=_('Do not remove image references within the document. 
This is only ' \ - 'useful when paired with the markdown-format option because' \ - ' image references are always removed with plain text output.')), + 'useful when paired with a txt-output-formatting option that ' + 'is not none because links are always removed with plain text output.')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - if opts.markdown_format: + if opts.txt_output_formatting.lower() == 'markdown': + from calibre.ebooks.txt.markdownml import MarkdownMLizer writer = MarkdownMLizer(log) + elif opts.txt_output_formatting.lower() == 'textile': + from calibre.ebooks.txt.textileml import TextileMLizer + writer = TextileMLizer(log) else: writer = TXTMLizer(log) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py new file mode 100644 index 0000000000..94834d8e79 --- /dev/null +++ b/src/calibre/ebooks/txt/textileml.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into Textile formatted plain text +''' + +import re + +from lxml import etree + +from calibre.ebooks.oeb.base import XHTML +from calibre.utils.html2textile import html2textile + +class TextileMLizer(object): + + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to Textile formatted TXT...') + self.oeb_book = oeb_book + self.opts = opts + + return self.mlize_spine() + + def mlize_spine(self): + output = [u''] + + for item in self.oeb_book.spine: + self.log.debug('Converting %s to Textile formatted TXT...' 
% item.href) + + html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + + if not self.opts.keep_links: + html = re.sub(r'<\s*a[^>]*>', '', html) + html = re.sub(r'<\s*/\s*a\s*>', '', html) + if not self.opts.keep_image_references: + html = re.sub(r'<\s*img[^>]*>', '', html) + html = re.sub(r'<\s*img\s*>', '', html) + + text = html2textile(html) + + # Ensure the section ends with at least two new line characters. + # This is to prevent the last paragraph from a section being + # combined into the fist paragraph of the next. + end_chars = text[-4:] + # Convert all newlines to \n + end_chars = end_chars.replace('\r\n', '\n') + end_chars = end_chars.replace('\r', '\n') + end_chars = end_chars[-2:] + if not end_chars[1] == '\n': + text += '\n\n' + if end_chars[1] == '\n' and not end_chars[0] == '\n': + text += '\n' + + output += text + + output = u''.join(output) + + return output diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 0e6a6b9574..33ed64cef1 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -4,7 +4,6 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -from PyQt4.Qt import Qt from calibre.gui2.convert.txt_output_ui import Ui_Form from calibre.gui2.convert import Widget @@ -21,26 +20,14 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', - 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references', + 'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references', 'txt_output_encoding']) self.db, self.book_id = db, book_id for x in get_option('newline').option.choices: self.opt_newline.addItem(x) + for x in get_option('txt_output_formatting').option.choices: + self.opt_txt_output_formatting.addItem(x) 
self.initialize_options(get_option, get_help, db, book_id) - self.opt_markdown_format.stateChanged.connect(self.enable_markdown_format) - self.enable_markdown_format(self.opt_markdown_format.checkState()) - def break_cycles(self): Widget.break_cycles(self) - - try: - self.opt_markdown_format.stateChanged.disconnect() - except: - pass - - def enable_markdown_format(self, state): - state = state == Qt.Checked - self.opt_keep_links.setEnabled(state) - self.opt_keep_image_references.setEnabled(state) - diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 57fe702db7..1ef9e6e6b9 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -6,100 +6,123 @@ 0 0 - 477 - 300 + 392 + 346 Form - - - - - &Line ending style: - - - opt_newline + + + + + General + + + + + Output &Encoding: + + + opt_txt_output_encoding + + + + + + + true + + + + + + + &Line ending style: + + + opt_newline + + + + + + + + + + &Formatting: + + + opt_txt_output_formatting + + + + + + + - - - - - - - Qt::Vertical - - - - 20 - 246 - - - - - - - - &Inline TOC + + + + Plain + + + + + &Maximum line length: + + + opt_max_line_length + + + + + + + + + + Force maximum line length + + + + + + + &Inline TOC + + + + - - - - - - - &Maximum line length: - - - opt_max_line_length - - - - - - - Force maximum line length - - - - - - - Apply Markdown formatting to text - - - - - - - Do not remove links (<a> tags) before processing - - - - - - - Do not remove image references before processing - - - - - - - Output Encoding: - - - - - - - true + + + + Markdown, Textile + + + + + Do not remove links (<a> tags) before processing + + + + + + + Do not remove image references before processing + + + + diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py new file mode 100644 index 0000000000..82797a81ad --- /dev/null +++ b/src/calibre/utils/html2textile.py @@ -0,0 +1,209 @@ +# -*- coding: utf-8 -*- + +# 
# Copyright (c) 2010, Webreactor - Marcin Lulek
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


from lxml import etree
from calibre.ebooks.oeb.base import barename


class EchoTarget(object):
    '''
    Parser target that renders a stream of element events as Textile.

    An instance is passed to ``etree.XMLParser(target=...)``; the parser
    then calls ``start``, ``end``, ``data``, ``comment`` and ``close``.
    Generated fragments are collected in ``final_output``. While an ``<a>``
    element is open (``block`` is True) fragments are held in ``haystack``
    and emitted as the link text when the element is closed.
    '''

    HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Inline elements: the same marker is written on open and on close.
    INLINE_MARKS = {
        'b': '*', 'strong': '*',
        'em': '_', 'i': '_',
        'cite': '??',
        'del': '-',
        'ins': '+',
        'sup': '^',
        'sub': '~',
        'span': '%',
    }

    def __init__(self):
        self.final_output = []
        self.block = False      # True while inside an <a> element
        self.ol_ident = 0       # number of currently open <ol> elements
        self.ul_ident = 0       # number of currently open <ul> elements
        self.list_types = []    # stack of open list tags ('ul' / 'ol')
        self.haystack = []      # fragments buffered while inside <a>
        self.a_part = {'title': None, 'href': ''}

    def _emit(self, text):
        '''Send ``text`` to the link buffer inside ``<a>``, else to the output.'''
        if self.block:
            self.haystack.append(text)
        else:
            self.final_output.append(text)

    def start(self, tag, attrib):
        '''
        Handle an opening tag.

        :param tag: Element name; any namespace is removed with ``barename``.
        :param attrib: Mapping of the element's attributes.
        '''
        tag = barename(tag)

        newline = '\n'
        dot = ''
        new_tag = ''

        if tag in self.HEADINGS:
            new_tag = tag
            dot = '. '
        elif tag == 'blockquote':
            new_tag = 'bq'
            dot = '. '
        elif tag in self.INLINE_MARKS:
            new_tag = self.INLINE_MARKS[tag]
            newline = ''
        elif tag == 'a':
            # Start buffering; the link is written out in end().
            self.block = True
            self.a_part = {'title': attrib.get('title'),
                           'href': attrib.get('href', '')}
            newline = ''
        elif tag == 'img':
            src = attrib.get('src', '')
            # Use whichever description is actually present so a missing
            # title is never rendered as the text "None".
            label = attrib.get('title') or attrib.get('alt')
            if label:
                new_tag = ' !%s(%s)' % (src, label)
            else:
                new_tag = ' !%s' % src
            newline = ''
        elif tag in ('ul', 'ol'):
            # Lists produce no text of their own; only track the nesting.
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1
            return
        elif tag == 'li':
            # A stray <li> outside any list is rendered as a first level
            # bullet instead of failing on the empty list stack.
            indent = max(self.ul_ident + self.ol_ident, 1)
            if self.list_types and self.list_types[-1] == 'ol':
                new_tag = '#' * indent + ' '
            else:
                new_tag = '*' * indent + ' '

        # Any other tag (including <p>) only contributes a newline.
        self._emit('%s%s%s' % (newline, new_tag, dot))

    def end(self, tag):
        '''
        Handle a closing tag.

        :param tag: Element name; any namespace is removed with ``barename``.
        '''
        tag = barename(tag)

        if tag in self.HEADINGS or tag == 'p':
            self._emit('\n')
        elif tag in self.INLINE_MARKS:
            # Goes through _emit so that a marker closed inside a link
            # stays inside the link text instead of preceding the link.
            self._emit(self.INLINE_MARKS[tag])
        elif tag == 'a':
            label = ''.join(self.haystack)
            self.haystack = []
            self.block = False
            title = self.a_part.get('title')
            href = self.a_part.get('href')
            if title:
                textilized = ' "%s (%s)":%s ' % (label, title, href)
            else:
                textilized = ' "%s":%s ' % (label, href)
            self.final_output.append(textilized)
        elif tag == 'img':
            self._emit('!')
        elif tag in ('ul', 'ol'):
            if tag == 'ul':
                self.ul_ident -= 1
            else:
                self.ol_ident -= 1
            self.list_types.pop()
            # Only the outermost list is followed by a newline.
            if not self.list_types:
                self._emit('\n')

    def data(self, data):
        '''Handle text content; line feeds are removed from it.'''
        # we don't want any linebreaks inside our tags
        self._emit(data.replace('\n', ''))

    def comment(self, text):
        '''Comments are ignored.'''
        pass

    def close(self):
        '''Called by the parser when parsing ends; the value is not used here.'''
        return "closed!"


def html2textile(html):
    '''
    Convert HTML markup to Textile formatted text.

    :param html: String containing the HTML to convert.
    :return: The Textile text with leading and trailing whitespace removed.
             An empty string is returned for empty or whitespace only input.
    '''
    if not html or not html.strip():
        return ''
    # 1st pass
    # clean the whitespace and convert html to xhtml
    tree = etree.fromstring(html, etree.HTMLParser())
    if tree is None:
        return ''
    xhtml = etree.tostring(tree, method="xml")
    root = etree.XML(xhtml, etree.XMLParser(remove_blank_text=True))
    cleaned_html = etree.tostring(root)
    # 2nd pass build textile
    target = EchoTarget()
    etree.fromstring(cleaned_html, etree.XMLParser(target=target))
    return ''.join(target.final_output).strip()