TXT output: All new Textile output with much greater preservation of formatting from the input document

2025-07-09 03:04:10 -04:00 · 2011-05-12 17:06:05 -06:00 · 2011-05-12 17:06:05 -06:00 · 1fc639a0c9
commit 1fc639a0c9
parent dc74afe1f2 dc0834e8bc
10 changed files with 685 additions and 318 deletions
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@ -253,7 +253,7 @@ class OutputProfile(Plugin):
    periodical_date_in_title = True
    #: Characters used in jackets and catalogs
-	missing_char = u'x'
+    missing_char = u'x'
    ratings_char = u'*'
    empty_ratings_char = u' '
    read_char = u'+'
@ -293,38 +293,38 @@ class iPadOutput(OutputProfile):
        }
    ]
-	missing_char = u'\u2715\u200a'		# stylized 'x' plus hair space
+    missing_char = u'\u2715\u200a'      # stylized 'x' plus hair space
-    ratings_char = u'\u2605'			# filled star
+    ratings_char = u'\u2605'            # filled star
-	empty_ratings_char = u'\u2606'		# hollow star
+    empty_ratings_char = u'\u2606'      # hollow star
-    read_char = u'\u2713'				# check mark
+    read_char = u'\u2713'               # check mark
    touchscreen = True
    # touchscreen_news_css {{{
    touchscreen_news_css = u'''
-			/* hr used in articles */
+            /* hr used in articles */
-			.article_articles_list {
+            .article_articles_list {
                width:18%;
-				}
+                }
            .article_link {
-            	color: #593f29;
+                color: #593f29;
                font-style: italic;
                }
            .article_next {
-				-webkit-border-top-right-radius:4px;
+                -webkit-border-top-right-radius:4px;
-				-webkit-border-bottom-right-radius:4px;
+                -webkit-border-bottom-right-radius:4px;
                font-style: italic;
                width:32%;
                }
            .article_prev {
-				-webkit-border-top-left-radius:4px;
+                -webkit-border-top-left-radius:4px;
-				-webkit-border-bottom-left-radius:4px;
+                -webkit-border-bottom-left-radius:4px;
                font-style: italic;
                width:32%;
                }
-			.article_sections_list {
+            .article_sections_list {
                width:18%;
-				}
+                }
            .articles_link {
                font-weight: bold;
                }
@ -334,8 +334,8 @@ class iPadOutput(OutputProfile):
            .caption_divider {
-            	border:#ccc 1px solid;
+                border:#ccc 1px solid;
-				}
+                }
            .touchscreen_navbar {
                background:#c3bab2;
@ -357,50 +357,50 @@ class iPadOutput(OutputProfile):
                text-align:center;
                }
-			.touchscreen_navbar td a:link {
+            .touchscreen_navbar td a:link {
-				color: #593f29;
+                color: #593f29;
-				text-decoration: none;
+                text-decoration: none;
-				}
+                }
-			/* Index formatting */
+            /* Index formatting */
-			.publish_date {
+            .publish_date {
-				text-align:center;
+                text-align:center;
-				}
+                }
-			.divider {
+            .divider {
-				border-bottom:1em solid white;
+                border-bottom:1em solid white;
-				border-top:1px solid gray;
+                border-top:1px solid gray;
-				}
+                }
-			hr.caption_divider {
+            hr.caption_divider {
-				border-color:black;
+                border-color:black;
-				border-style:solid;
+                border-style:solid;
-				border-width:1px;
+                border-width:1px;
-				}
+                }
            /* Feed summary formatting */
            .article_summary {
-            	display:inline-block;
+                display:inline-block;
-            	}
+                }
            .feed {
                font-family:sans-serif;
                font-weight:bold;
                font-size:larger;
-				}
+                }
            .feed_link {
                font-style: italic;
                }
            .feed_next {
-				-webkit-border-top-right-radius:4px;
+                -webkit-border-top-right-radius:4px;
-				-webkit-border-bottom-right-radius:4px;
+                -webkit-border-bottom-right-radius:4px;
                font-style: italic;
                width:40%;
                }
            .feed_prev {
-				-webkit-border-top-left-radius:4px;
+                -webkit-border-top-left-radius:4px;
-				-webkit-border-bottom-left-radius:4px;
+                -webkit-border-bottom-left-radius:4px;
                font-style: italic;
                width:40%;
                }
@ -410,24 +410,24 @@ class iPadOutput(OutputProfile):
                font-size: 160%;
                }
-			.feed_up {
+            .feed_up {
                font-weight: bold;
                width:20%;
-				}
+                }
            .summary_headline {
                font-weight:bold;
                text-align:left;
-				}
+                }
            .summary_byline {
                text-align:left;
                font-family:monospace;
-				}
+                }
            .summary_text {
                text-align:left;
-				}
+                }
        '''
        # }}}
@ -617,8 +617,8 @@ class KindleOutput(OutputProfile):
    supports_mobi_indexing = True
    periodical_date_in_title = False
-	missing_char = u'x\u2009'
+    missing_char = u'x\u2009'
-	empty_ratings_char = u'\u2606'
+    empty_ratings_char = u'\u2606'
    ratings_char = u'\u2605'
    read_char = u'\u2713'
@ -642,8 +642,8 @@ class KindleDXOutput(OutputProfile):
    #comic_screen_size         = (741, 1022)
    supports_mobi_indexing = True
    periodical_date_in_title = False
-	missing_char = u'x\u2009'
+    missing_char = u'x\u2009'
-	empty_ratings_char = u'\u2606'
+    empty_ratings_char = u'\u2606'
    ratings_char = u'\u2605'
    read_char = u'\u2713'
    mobi_ems_per_blockquote = 2.0
--- a/src/calibre/ebooks/textile/functions.py
+++ b/src/calibre/ebooks/textile/functions.py
@ -12,7 +12,7 @@ A Humane Web Text Generator
 #__date__ = '2009/12/04'
 __copyright__ = """
-Copyright (c) 2011, Leigh Parry
+Copyright (c) 2011, Leigh Parry <leighparry@blueyonder.co.uk>
 Copyright (c) 2011, John Schember <john@nachtimwald.com>
 Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
 Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
@ -219,14 +219,13 @@ class Textile(object):
    ]
    glyph_defaults = [
        (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'),                   r'\1\2&#215;\3'),                       #  dimension sign
-        (re.compile(r'(\d+)\'', re.I),                                 r'\1&#8242;'),                          #  prime
+        (re.compile(r'(\d+)\'(\s)', re.I),                             r'\1&#8242;\2'),                          #  prime
-        (re.compile(r'(\d+)\"', re.I),                                 r'\1&#8243;'),                          #  prime-double
+        (re.compile(r'(\d+)\"(\s)', re.I),                             r'\1&#8243;\2'),                          #  prime-double
        (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'),      r'<acronym title="\2">\1</acronym>'),   #  3+ uppercase acronym
        (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'),         r'<span class="caps">\1</span>'),       #  3+ uppercase
        (re.compile(r'\b(\s{0,1})?\.{3}'),                             r'\1&#8230;'),                          #  ellipsis
        (re.compile(r'^[\*_-]{3,}$', re.M),                            r'<hr />'),                             #  <hr> scene-break
-        (re.compile(r'\b--\b'),                                        r'&#8212;'),                            #  em dash
+        (re.compile(r'(^|[^-])--([^-]|$)'),                                r'\1&#8212;\2'),                        #  em dash
        (re.compile(r'(\s)--(\s)'),                                    r'\1&#8212;\2'),                        #  em dash
        (re.compile(r'\s-(?:\s|$)'),                                   r' &#8211; '),                          #  en dash
        (re.compile(r'\b( ?)[([]TM[])]', re.I),                        r'\1&#8482;'),                          #  trademark
        (re.compile(r'\b( ?)[([]R[])]', re.I),                         r'\1&#174;'),                           #  registered
@ -706,6 +705,21 @@ class Textile(object):
            result.append(line)
        return ''.join(result)
    def macros_only(self, text):
        """Expand ``{...}`` macros in the non-markup parts of *text*.

        The text is split on ``<...>`` tags; tag pieces are passed through
        untouched, and every other piece that contains a brace macro has
        each ``(pattern, replacement)`` pair of ``self.macro_defaults``
        applied to it in order.  Returns the re-joined string.
        """
        # fix: hackish
        text = re.sub(r'"\Z', '\" ', text)

        tag_splitter = re.compile(r'(<.*?>)', re.U)
        pieces = []
        for chunk in tag_splitter.split(text):
            looks_like_markup = re.search(r'<.*>', chunk) is not None
            # Only consult the macro table when the chunk actually holds a macro.
            if not looks_like_markup and re.search(r'{.+?}', chunk):
                for pattern, replacement in self.macro_defaults:
                    chunk = pattern.sub(replacement, chunk)
            pieces.append(chunk)
        return ''.join(pieces)
    def vAlign(self, input):
        """Map a vertical-alignment flag (``^``, ``-``, ``~``) to its keyword.

        Returns 'top', 'middle' or 'bottom'; any other value gives ''.
        """
        keywords = {'^': 'top', '-': 'middle', '~': 'bottom'}
        if input in keywords:
            return keywords[input]
        return ''
@ -814,6 +828,7 @@ class Textile(object):
        'fooobar ... and hello world ...'
        """
        text = self.macros_only(text)
        punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
        pattern = r'''
@ -1044,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
    return Textile(restricted=True, lite=lite,
                   noimage=noimage).textile(text, rel='nofollow',
                                            html_type=html_type)
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@ -66,19 +66,26 @@ class TXTOutput(OutputFormatPlugin):
            help=_('Do not remove image references within the document. This is only ' \
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_color',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove font color from output. This is only useful when ' \
                   'txt-output-formatting is set to textile. Textile is the only ' \
                   'formatting that supports setting font color. If this option is ' \
                   'not specified font color will not be set and default to the ' \
                   'color displayed by the reader (generally this is black).')),
     ])
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        if opts.txt_output_formatting.lower() == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
-            writer = MarkdownMLizer(log)
+            self.writer = MarkdownMLizer(log)
        elif opts.txt_output_formatting.lower() == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
-            writer = TextileMLizer(log)
+            self.writer = TextileMLizer(log)
        else:
-            writer = TXTMLizer(log)
+            self.writer = TXTMLizer(log)
-        txt = writer.extract_content(oeb_book, opts)
+        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)
        log.debug('\tReplacing newlines with selected type...')
@ -111,17 +118,28 @@ class TXTZOutput(TXTOutput):
        from calibre.ebooks.oeb.base import OEB_IMAGES
        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
-            with TemporaryFile('index.txt') as tf:
+            txt_name = 'index.txt'
            if opts.txt_output_formatting.lower() == 'textile':
                txt_name = 'index.text'
            with TemporaryFile(txt_name) as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
-                shutil.copy(tf, os.path.join(tdir, 'index.txt'))
+                shutil.copy(tf, os.path.join(tdir, txt_name))
            # Images
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
-                    path = os.path.join(tdir, os.path.dirname(item.href))
+                    if hasattr(self.writer, 'images'):
                        path = os.path.join(tdir, 'images')
                        if item.href in self.writer.images:
                            href = self.writer.images[item.href]
                        else:
                            continue
                    else:
                        path = os.path.join(tdir, os.path.dirname(item.href))
                        href = os.path.basename(item.href)
                    if not os.path.exists(path):
                        os.makedirs(path)
-                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
+                    with open(os.path.join(path, href), 'wb') as imgf:
                        imgf.write(item.data)
            # Metadata
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -242,6 +242,8 @@ def detect_formatting_type(txt):
    textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
    # Links
    textile_count += len(re.findall(r'"[^"]*":\S+', txt))
    # paragraph blocks
    textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
    # Decide if either markdown or textile is used in the text
    # based on the number of unique formatting elements found.
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@ -1,62 +1,489 @@
 # -*- coding: utf-8 -*-
 __license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
 __docformat__ = 'restructuredtext en'
 '''
 Transform OEB content into Textile formatted plain text
 '''
 import re
-from lxml import etree
+from functools import partial
-from calibre.ebooks.oeb.base import XHTML
+from calibre.ebooks.htmlz.oeb2html import OEB2HTML
-from calibre.utils.html2textile import html2textile
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
 from calibre.ebooks.oeb.stylizer import Stylizer
 from calibre.ebooks import unit_convert
 from calibre.ebooks.txt.unsmarten import unsmarten
-class TextileMLizer(object):
+class TextileMLizer(OEB2HTML):
    def __init__(self, log):
        # Keep the logger; the conversion methods report progress through
        # self.log.info() / self.log.debug().
        self.log = log
    def extract_content(self, oeb_book, opts):
        self.log.info('Converting XHTML to Textile formatted TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.in_pre = False
        self.in_table = False
        self.links = {}
        self.list = []
        self.our_links = []
        self.in_a_link = False
        self.our_ids = []
        self.images = {}
        self.id_no_text = u''
        self.style_embed = []
        self.remove_space_after_newline = False
        self.base_hrefs = [item.href for item in oeb_book.spine]
        self.map_resources(oeb_book)
-        return self.mlize_spine()
+        self.style_bold = False
        self.style_italic = False
        self.style_under = False
        self.style_strike = False
        self.style_smallcap = False
-    def mlize_spine(self):
+        txt = self.mlize_spine(oeb_book)
        txt = unsmarten(txt)
        # Do some tidying up
        txt = self.tidy_up(txt)
        return txt
    def mlize_spine(self, oeb_book):
        output = [u'']
-
+        for item in oeb_book.spine:
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
            self.rewrite_ids(item.data, item)
            rewrite_links(item.data, partial(self.rewrite_link, page=item))
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
            output += self.dump_text(item.data.find(XHTML('body')), stylizer)
            output.append('\n\n')
        return ''.join(output)
-            html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
+    def tidy_up(self, text):
        # May need tweaking and finetuning
        def check_escaping(text, tests):
            for t in tests:
                # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
                txt = '%s' % t
                if txt != '%':
                    text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text)
                    text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
                text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
            return text
-            if not self.opts.keep_links:
+        # Now tidy up links and ids - remove ones that don't have a corresponding opposite
-                html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
+        if self.opts.keep_links:
-            if not self.opts.keep_image_references:
+            for i in self.our_links:
-                html = re.sub(r'<\s*img[^>]*>', '', html)
+                if i[0] == '#':
                    if i not in self.our_ids:
                        text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
            for i in self.our_ids:
                if i not in self.our_links:
                    text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
        # Remove obvious non-needed escaping, add sub/sup-script ones
        text = check_escaping(text, ['\*', '_', '\*'])
        # escape the super/sub-scripts if needed
        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
        # escape the super/sub-scripts if needed
        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
-            text = html2textile(html)
+        #remove empty spans
        text = re.sub(r'%\xa0+', r'%', text)
        #remove empty spans - MAY MERGE SOME ?
        text = re.sub(r'%%', r'', text)
        #remove spans from tagged output
        text = re.sub(r'%([_+*-]+)%', r'\1', text)
        #remove spaces before a newline
        text = re.sub(r' +\n', r'\n', text)
        #remove newlines at top of file
        text = re.sub(r'^\n+', r'', text)
        #correct blockcode paras
        text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
        #correct blockquote paras
        text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text)
-            # Ensure the section ends with at least two new line characters.
+        #reduce blank lines
-            # This is to prevent the last paragraph from a section being
+        text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
-            # combined into the fist paragraph of the next.
+        text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
-            end_chars = text[-4:]
+        #Check span following blank para
-            # Convert all newlines to \n
+        text = re.sub(r'\n+ +%', r' %', text)
-            end_chars = end_chars.replace('\r\n', '\n')
+        text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
-            end_chars = end_chars.replace('\r', '\n')
+        # blank paragraph
-            end_chars = end_chars[-2:]
+        text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
-            if not end_chars[1] == '\n':
+        # blank paragraph
-                text += '\n\n'
+        text = re.sub(u'\n\xa0', r'\np. ', text)
-            if end_chars[1] == '\n' and not end_chars[0] == '\n':
+        # blank paragraph
-                text += '\n'
+        text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text)
        text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
        text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
        #sort out spaces in tables
        text = re.sub(r' {2,}\|', r' |', text)
-            output += text
+        # Now put back spaces removed earlier as they're needed here
        text = re.sub(r'\np\.\n', r'\np. \n', text)
        #reduce blank lines
        text = re.sub(r' \n\n\n', r' \n\n', text)
-        output = u''.join(output)
+        return text
-        return output
+    def remove_newlines(self, text):
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # Condense redundant spaces created by replacing newlines with spaces.
        text = re.sub(r'[ ]{2,}', ' ', text)
        text = re.sub(r'\t+', '', text)
        if self.remove_space_after_newline == True:
            text = re.sub(r'^ +', '', text)
            self.remove_space_after_newline = False
        return text
    def check_styles(self, style):
        """Build the Textile ``{...}`` style block for *style*.

        Only colour information is emitted, and only when the
        ``keep_color`` option is set: a ``color`` other than 'black' and any
        ``background``.  Returns '' when there is nothing to emit.
        """
        declarations = []
        if self.opts.keep_color:
            if 'color' in style.cssdict():
                colour = style['color']
                if colour != 'black':
                    declarations.append('color:' + colour + ';')
            if 'background' in style.cssdict():
                declarations.append('background:' + style['background'] + ';')
        if not declarations:
            return ''
        return '{' + ''.join(declarations) + '}'
    def check_halign(self, style):
        """Return the Textile alignment marker for ``style['text-align']``.

        'left' -> '<', 'justify' -> '<>', 'center' -> '=', 'right' -> '>';
        any other value yields ''.
        """
        markers = (
            ('left', '<'),
            ('justify', '<>'),
            ('center', '='),
            ('right', '>'),
        )
        for css_value, marker in markers:
            if style['text-align'] == css_value:
                return marker
        return ''
    def check_valign(self, style):
        """Return the Textile marker for ``style['vertical-align']``.

        'top' -> '^' and 'bottom' -> '~'; every other value (including
        'middle', which is deliberately not mapped) yields ''.
        """
        markers = (('top', '^'), ('bottom', '~'))  # , ('middle', '-')
        for css_value, marker in markers:
            if style['vertical-align'] == css_value:
                return marker
        return ''
    def check_padding(self, style, stylizer):
        """Translate left/right padding and margin into Textile indent marks.

        For each side, the padding and margin (when present and not 'auto')
        are converted with ``unit_convert``, summed, and divided by the
        profile's ``fbase``; the rounded result is the number of '('
        (left) or ')' (right) characters emitted.  Returns '' when neither
        side amounts to at least one unit.
        """
        sides = (
            ('padding-left', 'margin-left', '('),
            ('padding-right', 'margin-right', ')'),
        )
        pieces = []
        for padding_key, margin_key, marker in sides:
            padding_pts = 0
            margin_pts = 0
            if padding_key in style.cssdict() and style[padding_key] != 'auto':
                padding_pts = unit_convert(style[padding_key], style.width, style.fontSize, stylizer.profile.dpi)
            if margin_key in style.cssdict() and style[margin_key] != 'auto':
                margin_pts = unit_convert(style[margin_key], style.width, style.fontSize, stylizer.profile.dpi)
            total = margin_pts + padding_pts
            ems = int(round(total / stylizer.profile.fbase))
            if ems >= 1:
                pieces.append(marker * ems)
        return ''.join(pieces)
    def check_id_tag(self, attribs):
        """Return the Textile id marker ``(#id)`` for an element, if it has one.

        @attribs: Mapping of the element's attributes.

        When an ``id`` attribute is present the anchor ('#' + id) is recorded
        in ``self.our_ids`` and ``self.id_no_text`` is set to a non-breaking
        space.  Without an ``id`` nothing is recorded and '' is returned.
        """
        # Membership test instead of has_key(): has_key() is deprecated and
        # missing from Python 3 dicts, while ``in`` works for both plain
        # dicts and lxml attribute mappings.
        if 'id' not in attribs:
            return ''
        anchor = '#' + attribs['id']
        self.our_ids.append(anchor)
        self.id_no_text = u'\xa0'
        return '(' + anchor + ')'
    def build_block(self, tag, style, attribs, stylizer):
        """Assemble the opening signature of a Textile block.

        The result is a newline, the block *tag*, then (in this order) the
        id marker when links are kept, the padding marks, the horizontal
        alignment marker and the style block.
        """
        parts = ['\n', tag]
        if self.opts.keep_links:
            parts.append(self.check_id_tag(attribs))
        parts.append(self.check_padding(style, stylizer))
        parts.append(self.check_halign(style))
        parts.append(self.check_styles(style))
        return ''.join(parts)
    def prepare_string_for_textile(self, txt):
        """Wrap *txt* in Textile's ``==...==`` escape when needed.

        The escape is applied when a formatting character (or ``??``) sits
        directly between whitespace and non-whitespace; otherwise *txt* is
        returned unchanged.
        """
        needs_escape = re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt) is not None
        if not needs_escape:
            return txt
        return ' ==%s== ' % txt
    def dump_text(self, elem, stylizer):
        '''
        Recursively convert an element and its children to Textile markup.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.

        Returns a list of string fragments (opening markup, text, converted
        children, closing markup, tail text) for the caller to join.

        Conversion state shared across the recursion (open inline styles,
        list nesting, link / pre flags, collected ids and links) lives on
        ``self`` and is updated as elements are opened and closed.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        # text collects the output fragments; tags collects the closing
        # markers in opening order (it is reversed before closing below).
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return ['']

        # Soft scene breaks.
        # One u'\n\n\xa0' marker is emitted per em of top margin beyond the first.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)

        # Headings, paragraphs and divs open a Textile block; div is written as p.
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            if tag == 'div':
                tag = 'p'
            text.append(self.build_block(tag, style, attribs, stylizer))
            text.append('. ')
            tags.append('\n')

        # Inline styles. Each one is opened only if it is not already open
        # (tracked by the self.style_* flags), and its marker is pushed on
        # self.style_embed so that <br> can close and reopen it.
        # Inside a link the bare marker is used instead of the bracketed form.
        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:
                    if self.in_a_link:
                        text.append('_')
                        tags.append('_')
                    else:
                        text.append('[_')
                        tags.append('_]')
                    self.style_embed.append('_')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:
                    if self.in_a_link:
                        text.append('*')
                        tags.append('*')
                    else:
                        text.append('[*')
                        tags.append('*]')
                    self.style_embed.append('*')
                    self.style_bold = True
        if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
            if tag != 'a':
                if self.style_under == False:
                    text.append('[+')
                    tags.append('+]')
                    self.style_embed.append('+')
                    self.style_under = True
        if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
            if self.style_strike == False:
                text.append('[-')
                tags.append('-]')
                self.style_embed.append('-')
                self.style_strike = True
        # Line break: close the open inline styles (innermost first), emit
        # the newline, then reopen them in their original order.
        if tag == 'br':
            for i in reversed(self.style_embed):
                text.append(i)
            text.append('\n')
            for i in self.style_embed:
                text.append(i)
            tags.append('')
            self.remove_space_after_newline = True
        # Tag-specific opening markup; the matching closing marker goes on tags.
        if tag == 'blockquote':
            text.append('\nbq. ')
            tags.append('\n')
        elif tag in ('abbr', 'acronym'):
            text.append('')
            # NOTE(review): direct lookup - presumably raises KeyError when the
            # element has no title attribute; confirm whether that can occur.
            txt = attribs['title']
            tags.append('(' + txt + ')')
        elif tag == 'sup':
            text.append('^')
            tags.append('^')
        elif tag == 'sub':
            text.append('~')
            tags.append('~')
        elif tag == 'code':
            if self.in_pre:
                text.append('\nbc. ')
                tags.append('')
            else:
                text.append('@')
                tags.append('@')
        elif tag == 'cite':
            text.append('??')
            tags.append('??')
        elif tag == 'hr':
            text.append('\n***')
            tags.append('\n')
        elif tag == 'pre':
            self.in_pre = True
            text.append('\npre. ')
            tags.append('pre\n')
        elif tag == 'a':
            if self.opts.keep_links:
                if attribs.has_key('href'):
                    text.append('"')
                    # 'a' is a sentinel: when it is closed below, in_a_link
                    # is reset and nothing is written for it.
                    tags.append('a')
                    tags.append('":' + attribs['href'])
                    self.our_links.append(attribs['href'])
                    if attribs.has_key('title'):
                        tags.append('(' + attribs['title'] + ')')
                    self.in_a_link = True
                else:
                    # Anchor without href is written as a span.
                    text.append('%')
                    tags.append('%')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!' + self.check_halign(style)
                txt += self.check_valign(style)
                # NOTE(review): direct lookup - presumably raises KeyError for
                # an img without a src attribute; confirm.
                txt += attribs['src']
                text.append(txt)
                if attribs.has_key('alt'):
                    txt = attribs['alt']
                    if txt != '':
                        text.append('(' + txt + ')')
                tags.append('!')
        elif tag in ('ol', 'ul'):
            # Track list nesting; the depth sets the number of '*' / '#' per item.
            self.list.append({'name': tag, 'num': 0})
            text.append('')
            tags.append(tag)
        elif tag == 'li':
            # An li outside any list is treated as an unordered item.
            if self.list: li = self.list[-1]
            else: li = {'name': 'ul', 'num': 0}
            text.append('\n')
            if li['name'] == 'ul':
                text.append('*' * len(self.list) + ' ')
            elif li['name'] == 'ol':
                text.append('#' * len(self.list) + ' ')
            tags.append('')
        elif tag == 'dl':
            text.append('\n')
            tags.append('')
        elif tag == 'dt':
            text.append('')
            tags.append('\n')
        elif tag == 'dd':
            text.append('    ')
            tags.append('')
        # NOTE(review): this second 'dd' branch can never run because the
        # branch above already handles 'dd'.
        elif tag == 'dd':
            text.append('')
            tags.append('\n')
        elif tag == 'table':
            txt = self.build_block(tag, style, attribs, stylizer)
            txt += '. \n'
            # Only write the table signature when it carries id/padding/alignment/style.
            if txt != '\ntable. \n':
                text.append(txt)
            else:
                text.append('\n')
            tags.append('')
        elif tag == 'tr':
            txt = self.build_block('', style, attribs, stylizer)
            txt += '. '
            # Same idea for rows: skip an empty signature, and drop the
            # leading newline that build_block adds.
            if txt != '\n. ':
                txt = re.sub ('\n', '', txt)
                text.append(txt)
            tags.append('|\n')
        elif tag == 'td':
            text.append('|')
            txt = ''
            txt += self.check_halign(style)
            txt += self.check_valign(style)
            if attribs.has_key ('colspan'):
                txt += '\\' + attribs['colspan']
            if attribs.has_key ('rowspan'):
                txt += '/' + attribs['rowspan']
            txt += self.check_styles(style)
            if txt != '':
                text.append(txt + '. ')
            tags.append('')
        elif tag == 'th':
            text.append('|_. ')
            tags.append('')
        elif tag == 'span':
            if style['font-variant'] == 'small-caps':
                if self.style_smallcap == False:
                    text.append('&')
                    tags.append('&')
                    self.style_smallcap = True
            else:
                if self.in_a_link == False:
                    txt = '%'
                    if self.opts.keep_links:
                        txt += self.check_id_tag(attribs)
                        txt += self.check_styles(style)
                    # A span that adds neither an id nor a style is not written.
                    if txt != '%':
                        text.append(txt)
                        tags.append('%')

        # Ids on tags not listed here still get an id marker; the listed
        # tags are excluded from this generic step.
        if self.opts.keep_links and attribs.has_key('id'):
            if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
                text.append(self.check_id_tag(attribs))

        # Process the styles for any that we want to keep
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
                'span', 'table', 'tr', 'td'):
            if not self.in_a_link:
                text.append(self.check_styles(style))

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if not self.in_pre:
                txt = self.prepare_string_for_textile(self.remove_newlines(txt))
            text.append(txt)
            # Real text was written, so the id placeholder is not needed.
            self.id_no_text = u''

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            # For these tags the closing marker itself is not written; only
            # the pre flag / list stack is unwound.
            if tag in ('pre', 'ul', 'ol', 'li', 'table'):
                if tag == 'pre':
                    self.in_pre = False
                elif tag in ('ul', 'ol'):
                    if self.list: self.list.pop()
                    if not self.list: text.append('\n')
            else:
                if t == 'a':
                    self.in_a_link = False
                    t = ''
                # Emit the pending id placeholder (set by check_id_tag) once.
                text.append(self.id_no_text)
                self.id_no_text = u''
                if t in ('*]', '*'):
                    self.style_bold = False
                elif t in ('_]', '_'):
                    self.style_italic = False
                elif t == '+]':
                    self.style_under = False
                elif t == '-]':
                    self.style_strike = False
                elif t == '&':
                    self.style_smallcap = False
                if t in ('*]', '_]', '+]', '-]', '*', '_'):
                    # The popped value is not used; pop() is called to
                    # unwind the style_embed stack.
                    txt = self.style_embed.pop()
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
            ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if not self.in_pre:
                tail = self.prepare_string_for_textile(self.remove_newlines(tail))
            text.append(tail)

        return text
--- a/src/calibre/ebooks/txt/unsmarten.py
+++ b/src/calibre/ebooks/txt/unsmarten.py
@ -0,0 +1,108 @@
 # -*- coding: utf-8 -*-
 """unsmarten : html2textile helper function"""
 __version__ = '0.1'
 __author__ = 'Leigh Parry'
 import re
 def unsmarten(txt):
    """Replace "smart" typography and accented characters in *txt* with ASCII.

    Each rule matches an HTML numeric entity, the named entity (where one is
    listed) and the literal character, and substitutes a plain-ASCII
    spelling: dashes, dots and straight quotes for punctuation, and
    brace-delimited tokens such as ``{E'}`` or ``{1/2}`` for everything else.

    The rules are applied strictly in the order listed. The three quote
    rules depend on that order: curly double quotes are straightened first,
    then a right single quote that follows a quote character or whitespace
    is turned into the ``{'/}`` token, and only then are the remaining
    curly single quotes straightened.

    :param txt: unicode text to convert.
    :return: the converted text.
    """
    rules = (
        # Punctuation -- order matters for the quote rules (see docstring).
        (u'&#8211;|&ndash;|–', r'-'),        # en-dash
        (u'&#8212;|&mdash;|—', r'--'),       # em-dash
        (u'&#8230;|&hellip;|…', r'...'),     # ellipsis
        (u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"'),  # double quote
        (u'(["\'‘“]|\\s)’', r"\1{'/}"),      # apostrophe
        (u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'"),  # single quote
        # Currency, marks and fractions.
        # NOTE(review): the cent token is emitted as '{c\}' (backslash kept
        # literally by re.sub); confirm this is the spelling the consumer
        # of these tokens expects.
        (u'&#162;|&cent;|¢',     r'{c\}'),   # cent
        (u'&#163;|&pound;|£',    r'{L-}'),   # pound
        (u'&#165;|&yen;|¥',      r'{Y=}'),   # yen
        (u'&#169;|&copy;|©',     r'{(c)}'),  # copyright
        (u'&#174;|&reg;|®',      r'{(r)}'),  # registered
        (u'&#188;|&frac14;|¼',   r'{1/4}'),  # quarter
        (u'&#189;|&frac12;|½',   r'{1/2}'),  # half
        (u'&#190;|&frac34;|¾',   r'{3/4}'),  # three-quarter
        # Upper-case accented letters.
        # A-grave previously produced '{A`)}' (stray ')'); it now follows
        # the same '{<letter>`}' form as every other grave rule.
        (u'&#192;|&Agrave;|À',   r'{A`}'),   # A-grave
        (u'&#193;|&Aacute;|Á',   r"{A'}"),   # A-acute
        (u'&#194;|&Acirc;|Â',    r'{A^}'),   # A-circumflex
        (u'&#195;|&Atilde;|Ã',   r'{A~}'),   # A-tilde
        (u'&#196;|&Auml;|Ä',     r'{A"}'),   # A-umlaut
        (u'&#197;|&Aring;|Å',    r'{Ao}'),   # A-ring
        (u'&#198;|&AElig;|Æ',    r'{AE}'),   # AE
        (u'&#199;|&Ccedil;|Ç',   r'{C,}'),   # C-cedilla
        (u'&#200;|&Egrave;|È',   r'{E`}'),   # E-grave
        (u'&#201;|&Eacute;|É',   r"{E'}"),   # E-acute
        (u'&#202;|&Ecirc;|Ê',    r'{E^}'),   # E-circumflex
        (u'&#203;|&Euml;|Ë',     r'{E"}'),   # E-umlaut
        (u'&#204;|&Igrave;|Ì',   r'{I`}'),   # I-grave
        (u'&#205;|&Iacute;|Í',   r"{I'}"),   # I-acute
        (u'&#206;|&Icirc;|Î',    r'{I^}'),   # I-circumflex
        (u'&#207;|&Iuml;|Ï',     r'{I"}'),   # I-umlaut
        (u'&#208;|&ETH;|Ð',      r'{D-}'),   # ETH
        (u'&#209;|&Ntilde;|Ñ',   r'{N~}'),   # N-tilde
        (u'&#210;|&Ograve;|Ò',   r'{O`}'),   # O-grave
        (u'&#211;|&Oacute;|Ó',   r"{O'}"),   # O-acute
        (u'&#212;|&Ocirc;|Ô',    r'{O^}'),   # O-circumflex
        (u'&#213;|&Otilde;|Õ',   r'{O~}'),   # O-tilde
        (u'&#214;|&Ouml;|Ö',     r'{O"}'),   # O-umlaut
        (u'&#215;|&times;|×',    r'{x}'),    # dimension
        (u'&#216;|&Oslash;|Ø',   r'{O/}'),   # O-slash
        (u'&#217;|&Ugrave;|Ù',   r"{U`}"),   # U-grave
        (u'&#218;|&Uacute;|Ú',   r"{U'}"),   # U-acute
        (u'&#219;|&Ucirc;|Û',    r'{U^}'),   # U-circumflex
        (u'&#220;|&Uuml;|Ü',     r'{U"}'),   # U-umlaut
        (u'&#221;|&Yacute;|Ý',   r"{Y'}"),   # Y-acute
        (u'&#223;|&szlig;|ß',    r'{sz}'),   # sharp-s
        # Lower-case accented letters.
        (u'&#224;|&agrave;|à',   r'{a`}'),   # a-grave
        (u'&#225;|&aacute;|á',   r"{a'}"),   # a-acute
        (u'&#226;|&acirc;|â',    r'{a^}'),   # a-circumflex
        (u'&#227;|&atilde;|ã',   r'{a~}'),   # a-tilde
        (u'&#228;|&auml;|ä',     r'{a"}'),   # a-umlaut
        (u'&#229;|&aring;|å',    r'{ao}'),   # a-ring
        (u'&#230;|&aelig;|æ',    r'{ae}'),   # ae
        (u'&#231;|&ccedil;|ç',   r'{c,}'),   # c-cedilla
        (u'&#232;|&egrave;|è',   r'{e`}'),   # e-grave
        (u'&#233;|&eacute;|é',   r"{e'}"),   # e-acute
        (u'&#234;|&ecirc;|ê',    r'{e^}'),   # e-circumflex
        (u'&#235;|&euml;|ë',     r'{e"}'),   # e-umlaut
        (u'&#236;|&igrave;|ì',   r'{i`}'),   # i-grave
        (u'&#237;|&iacute;|í',   r"{i'}"),   # i-acute
        (u'&#238;|&icirc;|î',    r'{i^}'),   # i-circumflex
        (u'&#239;|&iuml;|ï',     r'{i"}'),   # i-umlaut
        (u'&#240;|&eth;|ð',      r'{d-}'),   # eth
        (u'&#241;|&ntilde;|ñ',   r'{n~}'),   # n-tilde
        (u'&#242;|&ograve;|ò',   r'{o`}'),   # o-grave
        (u'&#243;|&oacute;|ó',   r"{o'}"),   # o-acute
        (u'&#244;|&ocirc;|ô',    r'{o^}'),   # o-circumflex
        (u'&#245;|&otilde;|õ',   r'{o~}'),   # o-tilde
        (u'&#246;|&ouml;|ö',     r'{o"}'),   # o-umlaut
        (u'&#248;|&oslash;|ø',   r'{o/}'),   # o-stroke
        (u'&#249;|&ugrave;|ù',   r'{u`}'),   # u-grave
        (u'&#250;|&uacute;|ú',   r"{u'}"),   # u-acute
        (u'&#251;|&ucirc;|û',    r'{u^}'),   # u-circumflex
        (u'&#252;|&uuml;|ü',     r'{u"}'),   # u-umlaut
        (u'&#253;|&yacute;|ý',   r"{y'}"),   # y-acute
        (u'&#255;|&yuml;|ÿ',     r'{y"}'),   # y-umlaut
        (u'&#338;|&OElig;|Œ',    r'{OE}'),   # OE
        (u'&#339;|&oelig;|œ',    r'{oe}'),   # oe
        # NOTE(review): &#348;/&#349; and the literal characters here are
        # S/s with circumflex, while &Scaron;/&scaron; presumably name the
        # caron letters (&#352;/&#353;) -- confirm which was intended.
        (u'&#348;|&Scaron;|Ŝ',   r'{S^}'),   # Scaron
        (u'&#349;|&scaron;|ŝ',   r'{s^}'),   # scaron
        # Symbols.
        (u'&#8226;|&bull;|•',    r'{*}'),    # bullet
        (u'&#8355;|₣',           r'{Fr}'),   # Franc
        (u'&#8356;|₤',           r'{L=}'),   # Lira
        (u'&#8360;|₨',           r'{Rs}'),   # Rupee
        (u'&#8364;|&euro;|€',    r'{C=}'),   # euro
        (u'&#8482;|&trade;|™',   r'{tm}'),   # trademark
        (u'&#9824;|&spades;|♠',  r'{spade}'),    # spade
        (u'&#9827;|&clubs;|♣',   r'{club}'),     # club
        (u'&#9829;|&hearts;|♥',  r'{heart}'),    # heart
        (u'&#9830;|&diams;|♦',   r'{diamond}'),  # diamond
    )
    for pattern, replacement in rules:
        txt = re.sub(pattern, replacement, txt)

    # Move into main code?
    #    txt = re.sub(u'\xa0',   r'p. ', txt)              # blank paragraph
    #    txt = re.sub(u'\n\n\n\n',   r'\n\np. \n\n', txt)  # blank paragraph
    #    txt = re.sub(u'\n  \n',   r'\n<br />\n', txt)     # blank paragraph - br tag
    return txt
--- a/src/calibre/gui2/convert/txt_output.py
+++ b/src/calibre/gui2/convert/txt_output.py
@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
        Widget.__init__(self, parent,
        ['newline', 'max_line_length', 'force_max_line_length',
        'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references',
-        'txt_output_encoding'])
+        'keep_color', 'txt_output_encoding'])
        self.db, self.book_id = db, book_id
        for x in get_option('newline').option.choices:
            self.opt_newline.addItem(x)
--- a/src/calibre/gui2/convert/txt_output.ui
+++ b/src/calibre/gui2/convert/txt_output.ui
@ -122,6 +122,13 @@
        </property>
       </widget>
      </item>
      <item>
       <widget class="QCheckBox" name="opt_keep_color">
        <property name="text">
         <string>Keep text color, when possible</string>
        </property>
       </widget>
      </item>
     </layout>
    </widget>
   </item>
--- a/src/calibre/gui2/store/wizards_tower_books_plugin.py
+++ b/src/calibre/gui2/store/wizards_tower_books_plugin.py
@ -29,7 +29,7 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
            detail_item = self.url + detail_item
        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
+            open_url(QUrl(url_slash_cleaner(detail_item)))
        else:
            d = WebStoreDialog(self.gui, self.url, parent, detail_item)
            d.setWindowTitle(self.name)
@ -38,9 +38,9 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
    def search(self, query, max_results=10, timeout=60):
        url = 'http://www.wizardstowerbooks.com/search.html?for=' + urllib.quote(query)
-        
+
        br = browser()
-        
+
        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
@ -60,13 +60,13 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
                price = price.strip()
                if not price:
                    continue
-                
+
                title = ''.join(data.xpath('.//span[@class="prti"]/a/b/text()'))
                author = ''.join(data.xpath('.//p[@class="last"]/text()'))
                a, b, author = author.partition(' by ')
-                
+
                counter -= 1
-                
+
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
@ -74,15 +74,15 @@ class WizardsTowerBooksStore(BasicStoreConfig, StorePlugin):
                s.price = price.strip()
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED
-                
+
                yield s
    def get_details(self, search_result, timeout):
        br = browser()
        with closing(br.open(url_slash_cleaner(self.url + search_result.detail_item), timeout=timeout)) as nf:
            idata = html.fromstring(nf.read())
-        
+
            formats = ', '.join(idata.xpath('//select[@id="N1_"]//option//text()'))
            search_result.formats = formats.upper()
-            
+
        return True
--- a/src/calibre/utils/html2textile.py
+++ b/src/calibre/utils/html2textile.py
@ -1,209 +0,0 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2010, Webreactor - Marcin Lulek <info@webreactor.eu>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #    * Redistributions of source code must retain the above copyright
 #      notice, this list of conditions and the following disclaimer.
 #    * Redistributions in binary form must reproduce the above copyright
 #      notice, this list of conditions and the following disclaimer in the
 #      documentation and/or other materials provided with the distribution.
 #    * Neither the name of the <organization> nor the
 #      names of its contributors may be used to endorse or promote products
 #      derived from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 from lxml import etree
 from calibre.ebooks.oeb.base import barename
 class EchoTarget:
    """Parser target that turns a stream of XHTML events into Textile markup.

    An instance is handed to ``etree.XMLParser(target=...)``; the parser then
    calls ``start``/``end``/``data``/``comment``/``close`` as it walks the
    document. Output fragments are accumulated in ``final_output`` and are
    meant to be joined by the caller.

    While an ``<a>`` element is open (``block`` is True) every fragment is
    diverted to ``haystack`` so the complete link text can be wrapped in the
    Textile link syntax once the closing ``</a>`` is seen.
    """

    _HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Inline tags and the Textile marker written both before and after
    # their content.
    _INLINE_MARKS = {
        'b': '*', 'strong': '*',
        'em': '_', 'i': '_',
        'cite': '??',
        'del': '-',
        'ins': '+',
        'sup': '^',
        'sub': '~',
        'span': '',
    }

    def __init__(self):
        self.final_output = []   # finished Textile fragments, in order
        self.block = False       # True while inside an <a> element
        self.ol_ident = 0        # number of currently open <ol> elements
        self.ul_ident = 0        # number of currently open <ul> elements
        self.list_types = []     # stack of open list tags ('ul' / 'ol')
        self.haystack = []       # fragments buffered for the open link
        # Attributes of the open link; initialised so end('a') never hits
        # a missing attribute.
        self.a_part = {'title': None, 'href': ''}

    def _emit(self, text):
        """Append *text* to the link buffer inside ``<a>``, else to the output."""
        if self.block:
            self.haystack.append(text)
        else:
            self.final_output.append(text)

    def start(self, tag, attrib):
        """Handle an opening tag by emitting the matching Textile prefix.

        :param tag: element name, possibly namespace-qualified.
        :param attrib: mapping of the element's attributes.
        """
        tag = barename(tag)

        # Lists produce no text themselves; they only change the context
        # in which the following <li> elements are rendered.
        if tag in ('ul', 'ol'):
            self.list_types.append(tag)
            if tag == 'ul':
                self.ul_ident += 1
            else:
                self.ol_ident += 1
            return

        # Default for block-level and unrecognised tags: start a new line.
        newline, new_tag, dot = '\n', '', ''
        if tag in self._HEADINGS:
            new_tag, dot = tag, '. '
        elif tag == 'blockquote':
            new_tag, dot = 'bq', '. '
        elif tag in self._INLINE_MARKS:
            new_tag, newline = self._INLINE_MARKS[tag], ''
        elif tag == 'a':
            self.block = True
            self.a_part = {'title': attrib.get('title'),
                           'href': attrib.get('href', '')}
            newline = ''
        elif tag == 'img':
            # The parenthesised label used to be taken from ``title`` but
            # was gated on ``alt`` being present, which printed "(None)"
            # for images that only had alt text. Use whichever is set.
            label = attrib.get('title') or attrib.get('alt')
            if label:
                new_tag = ' !%s(%s)' % (attrib.get('src'), label)
            else:
                new_tag = ' !%s' % attrib.get('src')
            newline = ''
        elif tag == 'li':
            # A stray <li> outside any list is rendered as a first-level
            # bullet instead of raising IndexError on the empty stack.
            indent = max(1, self.ul_ident + self.ol_ident)
            list_type = self.list_types[-1] if self.list_types else 'ul'
            marker = '*' if list_type == 'ul' else '#'
            new_tag = marker * indent + ' '

        self._emit('%s%s%s' % (newline, new_tag, dot))

    def end(self, tag):
        """Handle a closing tag by emitting the matching Textile suffix.

        Suffixes are routed through the same sink as the prefixes, so
        markup nested inside a link stays inside the link text instead of
        being written ahead of it.
        """
        tag = barename(tag)
        if tag == 'a':
            link_text = ''.join(self.haystack)
            self.haystack = []
            if self.a_part['title']:
                textilized = ' "%s (%s)":%s ' % (
                    link_text,
                    self.a_part.get('title'),
                    self.a_part.get('href'),
                )
            else:
                textilized = ' "%s":%s ' % (
                    link_text,
                    self.a_part.get('href'),
                )
            # The finished link always goes to the real output.
            self.final_output.append(textilized)
            self.block = False
        elif tag in self._HEADINGS or tag == 'p':
            self._emit('\n')
        elif tag in self._INLINE_MARKS:
            self._emit(self._INLINE_MARKS[tag])
        elif tag == 'img':
            self._emit('!')
        elif tag in ('ul', 'ol'):
            if tag == 'ul':
                self.ul_ident -= 1
            else:
                self.ol_ident -= 1
            self.list_types.pop()
            # Leaving the outermost list: terminate the list block.
            if not self.list_types:
                self._emit('\n')

    def data(self, data):
        """Handle character data, dropping newlines found inside it."""
        # we don't want any linebreaks inside our tags
        self._emit(data.replace('\n', ''))

    def comment(self, text):
        """Ignore comments."""
        pass

    def close(self):
        """Return the value handed back by the parser when parsing ends."""
        return "closed!"
 def html2textile(html):
    """Convert an HTML string to Textile markup.

    Works in two passes: the input is first normalised to XML and
    re-serialised, then that cleaned markup is fed through a parser whose
    target is an ``EchoTarget`` that collects the Textile fragments.

    :param html: HTML markup to convert.
    :return: the joined Textile text with surrounding whitespace stripped.
    """
    # Pass 1: clean the whitespace and convert html to xhtml.
    html_tree = etree.fromstring(html, etree.HTMLParser())
    as_xml = etree.tostring(html_tree, method="xml")
    compact_root = etree.XML(as_xml, etree.XMLParser(remove_blank_text=True))
    cleaned_html = etree.tostring(compact_root)

    # Pass 2: build textile. The parse result is irrelevant here; the
    # output is read from the target afterwards.
    target = EchoTarget()
    etree.fromstring(cleaned_html, etree.XMLParser(target=target))
    return ''.join(target.final_output).strip()