From 804b248d46c71e5169c57da794ec2f69f2998dbf Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 16 Apr 2011 11:55:44 -0400 Subject: [PATCH 01/25] Add new but still wip textile output generator. --- src/calibre/ebooks/txt/output.py | 21 +- src/calibre/ebooks/txt/textileml.py | 341 +++++++++++++++++++++++++--- src/calibre/ebooks/txt/unsmarten.py | 109 +++++++++ 3 files changed, 432 insertions(+), 39 deletions(-) create mode 100644 src/calibre/ebooks/txt/unsmarten.py diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 4e54a97b45..7b50afb345 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): + print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer - writer = MarkdownMLizer(log) + self.writer = MarkdownMLizer(log) elif opts.txt_output_formatting.lower() == 'textile': from calibre.ebooks.txt.textileml import TextileMLizer - writer = TextileMLizer(log) + self.writer = TextileMLizer(log) else: - writer = TXTMLizer(log) + self.writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts) + txt = self.writer.extract_content(oeb_book, opts) txt = clean_ascii_chars(txt) log.debug('\tReplacing newlines with selected type...') @@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput): # Images for item in oeb_book.manifest: if item.media_type in OEB_IMAGES: - path = os.path.join(tdir, os.path.dirname(item.href)) + if hasattr(self.writer, 'images'): + path = os.path.join(tdir, 'images') + if item.href in self.writer.images: + href = self.writer.images[item.href] + else: + continue + else: + path = os.path.join(tdir, os.path.dirname(item.href)) + href = os.path.basename(item.href) if not os.path.exists(path): os.makedirs(path) - with open(os.path.join(tdir, item.href), 'wb') as imgf: + with open(os.path.join(path, href), 'wb') as imgf: imgf.write(item.data) # Metadata diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index d7e11695c5..9651fa8971 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, Leigh Parry ' __docformat__ = 'restructuredtext en' ''' @@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text import re -from lxml import etree +from functools import partial -from calibre.ebooks.oeb.base import XHTML -from calibre.utils.html2textile import html2textile +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.txt.unsmarten import unsmarten +from operator import itemgetter -class TextileMLizer(object): - def __init__(self, log): - self.log = log +class TextileMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Textile formatted TXT...') - self.oeb_book = oeb_book self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.images = {} + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) - return self.mlize_spine() + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + 
self.style_smallcap = False - def mlize_spine(self): + txt = self.mlize_spine(oeb_book) + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): output = [u''] - - for item in self.oeb_book.spine: + for item in oeb_book.spine: self.log.debug('Converting %s to Textile formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output.append('\n\n') + return ''.join(output) - html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) + def tidy_up(self, text): + def check_count(text, tests): + x = [] + for i, t in enumerate(reversed(tests)): + x.append((text.count(t), i, t)) + if x: + return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] + return '' - if not self.opts.keep_links: - html = re.sub(r'<\s*/*\s*a[^>]*>', '', html) - if not self.opts.keep_image_references: - html = re.sub(r'<\s*img[^>]*>', '', html) + # NEEDS TWEAKING +# def check_escaping(text, tests): +# for t in tests: +# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) +# return text - text = html2textile(html) + txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) + text = re.sub(txt+'(\S)', r'\n\1', text) - # Ensure the section ends with at least two new line characters. - # This is to prevent the last paragraph from a section being - # combined into the fist paragraph of the next. - end_chars = text[-4:] - # Convert all newlines to \n - end_chars = end_chars.replace('\r\n', '\n') - end_chars = end_chars.replace('\r', '\n') - end_chars = end_chars[-2:] - if not end_chars[1] == '\n': - text += '\n\n' - if end_chars[1] == '\n' and not end_chars[0] == '\n': - text += '\n' +# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - output += text + text = re.sub('\npre\. bc\.', '\nbc.', text) + text = re.sub('\np=. p. ', '\np. ', text) + text = re.sub('\np=. \n', '\n', text) + text = re.sub('\n{3,}', '\n\n', text) + text = re.sub(' \|', '|', text) - output = u''.join(output) + # started work on trying to fix footnotes +# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) + return text - return output + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. 
+ text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + return text + + def remove_leading_ws(self, text): + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + text = re.sub(r'\n+', '\n', text) + text = re.sub(r'\n[\t ]+', '\n', text) + return text + + def check_align(self, style, align, tests): + for i in tests: + if style[align] == i[0]: + return i[1] + return '' + + def check_padding(self, style, tests): + txt = '' + for i in tests: + try: + ems = int(round(float(style[i[0]] / style['font-size']))) + if ems >=1: + txt += i[1] * ems + except: + pass + return txt + + def check_id_tag(self, attribs): + txt = '' + if attribs.has_key('id'): + txt = '(#'+attribs['id']+')' + return txt + + def build_block(self, tag, style, attribs, finish): + txt = tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += finish + return txt + + def dump_text(self, elem, stylizer, page, tag_stack=[]): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append(self.build_block(tag, style, attribs, '. ')) + tags.append('\n') + + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: + text.append('*') + tags.append('*') + self.style_bold = True + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: + text.append('_') + tags.append('_') + self.style_italic = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: + text.append('+') + tags.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: + text.append('-') + tags.append('-') + self.style_strike = True + if style['font-variant'] == 'small-caps': + if self.style_smallcap == 0: + text.append('&') + tags.append('&') + self.style_smallcap = 1 + if tag == 'br': + text.append('') + tags.append('\n') + elif tag == 'blockquote': + text.append('bq. ') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('bc. 
') + tags.append('\n') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***\n') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('pre. ') + tags.append('pre') + elif tag == 'a': + if self.opts.keep_links: + text.append ('"') + tags.append('":' + attribs['href']) + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + elif tag == 'img': + if self.opts.keep_image_references: + text.append ('!' + attribs['src']) + if attribs.has_key('alt'): + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name':tag, 'num':0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: li = self.list[-1] + else: li = {'name':'ul', 'num':0} + if li['name'] == 'ul': text.append('*'*len(self.list)+' ') + elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + self.in_table = True + text.append('') + tags.append('table') + elif tag == 'tr': + text.append('') + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + if attribs.has_key ('colspan'): + txt += '\\' + attribs['colspan'] + if attribs.has_key ('rowspan'): + txt += '/' + attribs['rowspan'] + if txt != '': + text.append(txt+'. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + + if self.opts.keep_links and attribs.has_key('id'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + text.append('(#' + attribs['id'] + ')') + + # If wanted process all style tags here - before taxt in tags is written + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + if self.in_table: + txt = self.remove_newlines(txt) + else: + txt = self.remove_leading_ws(txt) + text.append(txt) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page, tag_stack+tags) + + # Close all open tags. + tags.reverse() + for t in tags: + if tag in ('pre', 'ul', 'ol', 'li', 'table'): + if tag == 'pre': + self.in_pre = False + if tag == 'table': + self.in_table = False + if tag in ('ul', 'ol'): + if self.list: self.list.pop() + else: + text.append('%s' % t) + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False + + # Add the text that is outside of the tag. 
+ if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + if self.in_table: + tail = self.remove_newlines(tail) + else: + tail = self.remove_leading_ws(tail) + text.append(tail) + + return text diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py new file mode 100644 index 0000000000..30a22bf069 --- /dev/null +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +''' + +''' + +__version__ = '0.1' +__author__ = 'Leigh Parry' + +import re + +def unsmarten(txt): + txt = re.sub(u'–|–|–', r'-', txt) # en-dash + txt = re.sub(u'—|—|—', r'--', txt) # em-dash + txt = re.sub(u'…|…|…', r'...', txt) # ellipsis + + txt = re.sub(u'“|”|″|“|”|″|“|”|″', r'"', txt) # double quote + txt = re.sub(u'(["\'‘“]|\s)’', r"\1{'/}", txt) # apostrophe + txt = re.sub(u'‘|’|′|‘|’|′|‘|’|′', r"'", txt) # single quote + + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', 
r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron + txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph + txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph + txt = re.sub(u'\n \n', r'\n
\n', txt) # blank paragraph - br tag + + return txt From be3d441d3bb4705fc24261312644ae148a0581c4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 Apr 2011 06:49:27 -0400 Subject: [PATCH 02/25] More textile work. --- src/calibre/ebooks/txt/output.py | 1 - src/calibre/ebooks/txt/textileml.py | 125 +++++++++++++++++++--------- src/calibre/ebooks/txt/unsmarten.py | 11 ++- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 7b50afb345..606dec4a63 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -70,7 +70,6 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - print 'New' if opts.txt_output_formatting.lower() == 'markdown': from calibre.ebooks.txt.markdownml import MarkdownMLizer self.writer = MarkdownMLizer(log) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 9651fa8971..9a025e0aef 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -58,31 +58,39 @@ class TextileMLizer(OEB2HTML): return ''.join(output) def tidy_up(self, text): - def check_count(text, tests): - x = [] - for i, t in enumerate(reversed(tests)): - x.append((text.count(t), i, t)) - if x: - return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] - return '' +# def check_count(text, tests): +# x = [] +# for i, t in enumerate(reversed(tests)): +# x.append((text.count(t), i, t)) +# if x: +# return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] +# return '' - # NEEDS TWEAKING -# def check_escaping(text, tests): -# for t in tests: -# text = re.sub(r'(\S)('+t+'\w+'+t+')', r'\1[\2]', text) -# text = re.sub(r'(\s)('+t+'\w+'+t+')(\S)', r'\1[\2]\3', text) -# return text + # Needs tweaking and finetuning - don't use yet. + def check_escaping(text, tests): + for t in tests: + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) +# text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) +# text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) +# text = re.sub(r'(["\'])\[('+t+'\w+'+t+')\]', r'\1\2', text) +# text = re.sub(r'\[('+t+'\w+'+t+')\](["\',\.!\?])', r'\1\2', text) + return text - txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) - text = re.sub(txt+'(\S)', r'\n\1', text) +# txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) +# text = re.sub(txt+'(\S)', r'\n\1', text) -# text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) - text = re.sub('\npre\. bc\.', '\nbc.', text) - text = re.sub('\np=. p. ', '\np. ', text) - text = re.sub('\np=. \n', '\n', text) - text = re.sub('\n{3,}', '\n\n', text) - text = re.sub(' \|', '|', text) + text = re.sub(r'^\n+', r'', text) + text = re.sub(r'\npre\. bc\.', r'\nbc.', text) + text = re.sub(r'\nbq\. \n\np\. ', r'\nbq. ', text) + text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) + text = re.sub(r'\n{3}', r'\n\n', text) + text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) + text = re.sub(r'p.*\. \n\n', r'', text) +# text = re.sub(u'\n \n', r'\n
\n', text) # blank paragraph - br tag + text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph + text = re.sub(r' \|', r'|', text) # started work on trying to fix footnotes # text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) @@ -94,20 +102,29 @@ class TextileMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t+', '', text) + text = re.sub(r'\t +', '', text) +# text = re.sub(r'\n +', '', text) return text def remove_leading_ws(self, text): text = text.replace('\r\n', '\n') text = text.replace('\r', '\n') - text = re.sub(r'\n+', '\n', text) text = re.sub(r'\n[\t ]+', '\n', text) + text = re.sub(r'\n{2,}', '\n', text) return text - def check_align(self, style, align, tests): + def check_halign(self, style): + tests = {'left':'<','justify':'<>','center':'=','right':'>'} for i in tests: - if style[align] == i[0]: - return i[1] + if style['text-align'] == i: + return tests[i] + return '' + + def check_valign(self, style): + tests = {'top':'^','bottom':'~', 'middle':'-'} + for i in tests: + if style['vertical-align'] == i: + return tests[i] return '' def check_padding(self, style, tests): @@ -124,15 +141,16 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): - txt = '(#'+attribs['id']+')' + #if attribs['id'] in self.links: + txt = '(#'+attribs['id']+')' return txt def build_block(self, tag, style, attribs, finish): - txt = tag + txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) - txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) + txt += self.check_halign(style) txt += finish return txt @@ -163,7 +181,17 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + #For debugging + if tag == 'h1': + for i in self.links: + text.append(i) + text.append('\n') + if tag == 'div': + tag = 'p' text.append(self.build_block(tag, style, attribs, '. ')) tags.append('\n') @@ -191,10 +219,10 @@ class TextileMLizer(OEB2HTML): tags.append('-') self.style_strike = True if style['font-variant'] == 'small-caps': - if self.style_smallcap == 0: + if self.style_smallcap == False: text.append('&') tags.append('&') - self.style_smallcap = 1 + self.style_smallcap = True if tag == 'br': text.append('') tags.append('\n') @@ -236,7 +264,10 @@ class TextileMLizer(OEB2HTML): tags.append('(' + attribs['title'] + ')') elif tag == 'img': if self.opts.keep_image_references: - text.append ('!' + attribs['src']) + txt = '!' 
+ self.check_halign(style) + txt += self.check_valign(style) + txt += attribs['src'] + text.append(txt) if attribs.has_key('alt'): txt = attribs['alt'] if txt != '': @@ -247,6 +278,7 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append(tag) elif tag == 'li': +# text.append('\n') if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} if li['name'] == 'ul': text.append('*'*len(self.list)+' ') @@ -273,8 +305,8 @@ class TextileMLizer(OEB2HTML): elif tag == 'td': text.append('|') txt = '' - txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']]) - txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']]) + txt += self.check_halign(style) + txt += self.check_valign(style) if attribs.has_key ('colspan'): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): @@ -288,7 +320,10 @@ class TextileMLizer(OEB2HTML): if self.opts.keep_links and attribs.has_key('id'): if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - text.append('(#' + attribs['id'] + ')') + if tag == 'span': + text.append(' %') + tags.append('% ') + text.append('(#' + attribs['id'] + u')\xa0') # If wanted process all style tags here - before taxt in tags is written @@ -318,11 +353,19 @@ class TextileMLizer(OEB2HTML): if self.list: self.list.pop() else: text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == '*': + self.style_bold = False + if t == '_': + self.style_italic = False + if t == '+': + self.style_under = False + if t == '-': + self.style_strike = False + if t == '&': + self.style_smallcap = False + + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py index 30a22bf069..40444ba601 100644 --- a/src/calibre/ebooks/txt/unsmarten.py +++ b/src/calibre/ebooks/txt/unsmarten.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -''' - -''' +"""unsmarten : html2textile helper function""" __version__ = '0.1' __author__ = 'Leigh Parry' @@ -102,8 +100,9 @@ def unsmarten(txt): txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond - txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph - txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph - txt = re.sub(u'\n \n', r'\n
<br />\n', txt) # blank paragraph - br tag
+
+    # Move into main code?
+#    txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph
+#    txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph
+#    txt = re.sub(u'\n \n', r'\n<br />
\n', txt) # blank paragraph - br tag return txt From fabef627e3dd85d06989551614db5277e72021c7 Mon Sep 17 00:00:00 2001 From: Byron Li Date: Mon, 25 Apr 2011 21:11:24 +0800 Subject: [PATCH 03/25] Add a douban.com plugin stub. Not working yet. --- src/calibre/customize/builtins.py | 5 +- src/calibre/ebooks/metadata/sources/douban.py | 361 ++++++++++++++++++ 2 files changed, 364 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/sources/douban.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index c27fa2a57b..3c769f8dc7 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -628,8 +628,9 @@ if test_eight_code: from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.isbndb import ISBNDB from calibre.ebooks.metadata.sources.overdrive import OverDrive - - plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive] + from calibre.ebooks.metadata.sources.douban import Douban + + plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban] # }}} else: diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py new file mode 100644 index 0000000000..b50bb6ff85 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ; 2011, Li Fanxi ' +__docformat__ = 'restructuredtext en' + +import time, hashlib +from urllib import urlencode +from functools import partial +from Queue import Queue, Empty + +from lxml import etree + +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode +from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_chars +from calibre import as_unicode + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'dc' : 'http://purl.org/dc/terms', + 'gd' : 'http://schemas.google.com/g/2005' + } + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'db': 'http://www.douban.com/xmlns/' + } +XPath = partial(etree.XPath, namespaces=NAMESPACES) +total_results = XPath('//openSearch:totalResults') +start_index = XPath('//openSearch:startIndex') +items_per_page = XPath('//openSearch:itemsPerPage') +entry = XPath('//atom:entry') +entry_id = XPath('descendant::atom:id') +title = XPath('descendant::atom:title') +description = XPath('descendant::atom:summary') +publisher = XPath("descendant::db:attribute[@name='publisher']") +isbn = XPath("descendant::db:attribute[@name='isbn13']") +date = XPath("descendant::db:attribute[@name='pubdate']") +creator = XPath("descendant::db:attribute[@name='author']") +tag = XPath("descendant::db:tag") + +def get_details(browser, url, timeout): # {{{ + try: + raw = browser.open_novisit(url, timeout=timeout).read() + except Exception as e: + gc = getattr(e, 'getcode', lambda : -1) + if gc() != 403: + raise + # Google is throttling us, wait a little + time.sleep(2) + raw = browser.open_novisit(url, timeout=timeout).read() + + return raw +# }}} + +def to_metadata(browser, log, entry_, timeout): # {{{ 
+ + def get_text(extra, x): + try: + ans = x(extra) + if ans: + ans = ans[0].text + if ans and ans.strip(): + return ans.strip() + except: + log.exception('Programming error:') + return None + + + id_url = entry_id(entry_)[0].text + google_id = id_url.split('/')[-1] + title_ = ': '.join([x.text for x in title(entry_)]).strip() + authors = [x.text.strip() for x in creator(entry_) if x.text] + if not authors: + authors = [_('Unknown')] + if not id_url or not title: + # Silently discard this entry + return None + + mi = Metadata(title_, authors) + mi.identifiers = {'google':google_id} + try: + raw = get_details(browser, id_url, timeout) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0]) + extra = entry(feed)[0] + except: + log.exception('Failed to get additional details for', mi.title) + return mi + + mi.comments = get_text(extra, description) + #mi.language = get_text(extra, language) + mi.publisher = get_text(extra, publisher) + + # ISBN + isbns = [] + for x in identifier(extra): + t = str(x.text).strip() + if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): + if t[:5].upper() == 'ISBN:': + t = check_isbn(t[5:]) + if t: + isbns.append(t) + if isbns: + mi.isbn = sorted(isbns, key=len)[-1] + mi.all_isbns = isbns + + # Tags + try: + btags = [x.text for x in subject(extra) if x.text] + tags = [] + for t in btags: + atags = [y.strip() for y in t.split('/')] + for tag in atags: + if tag not in tags: + tags.append(tag) + except: + log.exception('Failed to parse tags:') + tags = [] + if tags: + mi.tags = [x.replace(',', ';') for x in tags] + + # pubdate + pubdate = get_text(extra, date) + if pubdate: + try: + default = utcnow().replace(day=15) + mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) + except: + log.error('Failed to parse pubdate %r'%pubdate) + + # Ratings + for x in rating(extra): + try: + mi.rating = float(x.get('average')) + if mi.rating > 5: + mi.rating /= 2 + except: + log.exception('Failed to parse rating') + + # Cover + mi.has_google_cover = None + for x in extra.xpath( + '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): + mi.has_google_cover = x.get('href') + break + + return mi +# }}} + +class Douban(Source): + + name = 'Douban Books' + author = _('Li Fanxi') + + description = _('Downloads metadata from Douban.com') + + capabilities = frozenset(['identify', 'cover']) + touched_fields = frozenset(['title', 'authors', 'tags', + 'comments', 'publisher', 'identifier:isbn', 'rating', + 'identifier:douban']) # language currently disabled + supports_gzip_transfer_encoding = True + cached_cover_url_is_reliable = True + + DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' +# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' + +# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) + + def get_book_url(self, identifiers): # {{{ + db = identifiers.get('douban', None) + if db is not None: + return db + else: + return None + # }}} + + def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ + SEARCH_URL = 'http://api.douban.com/book/subjects?' 
+ ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + + q = '' + t = None + isbn = check_isbn(identifiers.get('isbn', None)) + if isbn is not None: + q = isbn + t = 'isbn' + elif title or authors: + def build_term(prefix, parts): + return ' '.join(x for x in parts) + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + q += build_term('title', title_tokens) + author_tokens = self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + q += ((' ' if q != '' else '') + + build_term('author', author_tokens)) + t = 'search' + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + print(q) + url = None + if t == "isbn": + url = ISBN_URL + q + else: + url = SEARCH_URL + urlencode({ + 'q': q, + }) + if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': + url = url + "?apikey=" + self.DOUBAN_API_KEY + print(url) + return url + # }}} + + def download_cover(self, log, result_queue, abort, # {{{ + title=None, authors=None, identifiers={}, timeout=30): + cached_url = self.get_cached_cover_url(identifiers) + if cached_url is None: + log.info('No cached cover found, running identify') + rq = Queue() + self.identify(log, rq, abort, title=title, authors=authors, + identifiers=identifiers) + if abort.is_set(): + return + results = [] + while True: + try: + results.append(rq.get_nowait()) + except Empty: + break + results.sort(key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers)) + for mi in results: + cached_url = self.get_cached_cover_url(mi.identifiers) + if cached_url is not None: + break + if cached_url is None: + log.info('No cover found') + return + + if abort.is_set(): + return + br = self.browser + log('Downloading cover from:', cached_url) + try: + cdata = br.open_novisit(cached_url, timeout=timeout).read() + if cdata: + if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5: + log.warning('Google returned a dummy image, ignoring') + else: + result_queue.put((self, cdata)) + except: + log.exception('Failed to download cover from:', cached_url) + + # }}} + + def get_cached_cover_url(self, identifiers): # {{{ + url = None + goog = identifiers.get('google', None) + if goog is None: + isbn = identifiers.get('isbn', None) + if isbn is not None: + goog = self.cached_isbn_to_identifier(isbn) + if goog is not None: + url = self.cached_identifier_to_cover_url(goog) + + return url + # }}} + + def get_all_details(self, br, log, entries, abort, # {{{ + result_queue, timeout): + for relevance, i in enumerate(entries): + try: + ans = to_metadata(br, log, i, timeout) + if isinstance(ans, Metadata): + ans.source_relevance = relevance + goog = ans.identifiers['google'] + for isbn in getattr(ans, 'all_isbns', []): + self.cache_isbn_to_identifier(isbn, goog) + if ans.has_google_cover: + self.cache_identifier_to_cover_url(goog, + self.GOOGLE_COVER%goog) + self.clean_downloaded_metadata(ans) + result_queue.put(ans) + except: + log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) + if abort.is_set(): + break + # }}} + + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ + identifiers={}, timeout=30): + query = self.create_query(log, title=title, authors=authors, + identifiers=identifiers) + if not query: + log.error('Insufficient metadata to construct query') + return + br = self.browser + try: + raw = br.open_novisit(query, timeout=timeout).read() + except Exception as e: + log.exception('Failed to make identify query: %r'%query) + return as_unicode(e) + + 
try: + parser = etree.XMLParser(recover=True, no_network=True) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0], parser=parser) + entries = entry(feed) + except Exception as e: + log.exception('Failed to parse identify results') + return as_unicode(e) + + if not entries and identifiers and title and authors and \ + not abort.is_set(): + return self.identify(log, result_queue, abort, title=title, + authors=authors, timeout=timeout) + + # There is no point running these queries in threads as google + # throttles requests returning 403 Forbidden errors + self.get_all_details(br, log, entries, abort, result_queue, timeout) + + return None + # }}} + +if __name__ == '__main__': # tests {{{ + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py + from calibre.ebooks.metadata.sources.test import (test_identify_plugin, + title_test, authors_test) + test_identify_plugin(GoogleBooks.name, + [ + + + ( + {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby', + 'authors':['Fitzgerald']}, + [title_test('The great gatsby', exact=True), + authors_test(['Francis Scott Fitzgerald'])] + ), + + ( + {'title': 'Flatland', 'authors':['Abbott']}, + [title_test('Flatland', exact=False)] + ), + ]) +# }}} + From ea4b5b9054765bb737179d904c9168846def2e45 Mon Sep 17 00:00:00 2001 From: Byron Li Date: Fri, 29 Apr 2011 16:29:57 +0800 Subject: [PATCH 04/25] First working version of Douban book plugin. --- src/calibre/ebooks/metadata/sources/douban.py | 83 +++++++++---------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index b50bb6ff85..8f1794b33f 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -25,14 +25,8 @@ from calibre import as_unicode NAMESPACES = { 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', 'atom' : 'http://www.w3.org/2005/Atom', - 'dc' : 'http://purl.org/dc/terms', - 'gd' : 'http://schemas.google.com/g/2005' - } - -NAMESPACES = { - 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', - 'atom' : 'http://www.w3.org/2005/Atom', - 'db': 'http://www.douban.com/xmlns/' + 'db': 'http://www.douban.com/xmlns/', + 'gd': 'http://schemas.google.com/g/2005' } XPath = partial(etree.XPath, namespaces=NAMESPACES) total_results = XPath('//openSearch:totalResults') @@ -47,6 +41,8 @@ isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") tag = XPath("descendant::db:tag") +rating = XPath("descendant::gd:rating[@name='average']") +cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") def get_details(browser, url, timeout): # {{{ try: @@ -77,7 +73,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ id_url = entry_id(entry_)[0].text - google_id = id_url.split('/')[-1] + douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: @@ -87,7 +83,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ return None mi = Metadata(title_, authors) - mi.identifiers = {'google':google_id} + mi.identifiers = {'douban':douban_id} try: raw = get_details(browser, id_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -103,13 +99,9 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # 
ISBN isbns = [] - for x in identifier(extra): - t = str(x.text).strip() - if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): - if t[:5].upper() == 'ISBN:': - t = check_isbn(t[5:]) - if t: - isbns.append(t) + for x in [t.text for t in isbn(extra)]: + if check_isbn(x): + isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns @@ -139,21 +131,23 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.error('Failed to parse pubdate %r'%pubdate) # Ratings - for x in rating(extra): + if rating(extra): try: - mi.rating = float(x.get('average')) - if mi.rating > 5: - mi.rating /= 2 + mi.rating = float(rating(extra).text) / 2.0 except: log.exception('Failed to parse rating') + mi.rating = 0 # Cover - mi.has_google_cover = None - for x in extra.xpath( - '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): - mi.has_google_cover = x.get('href') - break - + mi.has_douban_cover = None + u = cover_url(extra) + print(u) + if u: + u = u[0].replace('/spic/', '/lpic/'); + print(u) + # If URL contains "book-default", the book doesn't have a cover + if u.find('book-default') == -1: + mi.has_douban_cover = u return mi # }}} @@ -172,6 +166,7 @@ class Douban(Source): cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s' # GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' # DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) @@ -179,7 +174,7 @@ class Douban(Source): def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return db + return DOUBAN_ID_URL % db else: return None # }}} @@ -206,11 +201,11 @@ class Douban(Source): q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' + q = q.strip() if isinstance(q, unicode): q = q.encode('utf-8') if not q: return None - print(q) url = None if t == "isbn": url = ISBN_URL + q @@ -220,7 +215,6 @@ class Douban(Source): }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': url = url + "?apikey=" + self.DOUBAN_API_KEY - print(url) return url # }}} @@ -257,10 +251,7 @@ class Douban(Source): try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: - if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5: - log.warning('Google returned a dummy image, ignoring') - else: - result_queue.put((self, cdata)) + result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) @@ -268,13 +259,13 @@ class Douban(Source): def get_cached_cover_url(self, identifiers): # {{{ url = None - goog = identifiers.get('google', None) - if goog is None: + db = identifiers.get('douban', None) + if db is None: isbn = identifiers.get('isbn', None) if isbn is not None: - goog = self.cached_isbn_to_identifier(isbn) - if goog is not None: - url = self.cached_identifier_to_cover_url(goog) + db = self.cached_isbn_to_identifier(isbn) + if db is not None: + url = self.cached_identifier_to_cover_url(db) return url # }}} @@ -286,12 +277,12 @@ class Douban(Source): ans = to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): ans.source_relevance = relevance - goog = ans.identifiers['google'] + db = ans.identifiers['douban'] for isbn in getattr(ans, 'all_isbns', []): - self.cache_isbn_to_identifier(isbn, goog) - if ans.has_google_cover: - self.cache_identifier_to_cover_url(goog, - self.GOOGLE_COVER%goog) + self.cache_isbn_to_identifier(isbn, db) + if ans.has_douban_cover: + 
self.cache_identifier_to_cover_url(db, + ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: @@ -315,7 +306,6 @@ class Douban(Source): except Exception as e: log.exception('Failed to make identify query: %r'%query) return as_unicode(e) - try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -324,7 +314,8 @@ class Douban(Source): except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) - + if not title: + title = "" if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, From ff6043ce0f0659edce0c05e7e669f5e9c106ea96 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 08:44:30 -0400 Subject: [PATCH 05/25] ... --- src/calibre/ebooks/textile/functions.py | 12 +- src/calibre/ebooks/txt/textileml.py | 202 ++++++++++++++---------- 2 files changed, 123 insertions(+), 91 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 39f793face..dd1914cf9f 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -225,8 +225,8 @@ class Textile(object): (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr />'), # <hr />
scene-break - (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'(\s)--(\s)'), r'\1—\2'), # em dash +# (re.compile(r'\b--\b'), r'—'), # em dash + (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -868,7 +868,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,7 +900,9 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup' + '^' : 'sup', + '&' : 'span style="font-variant:small-caps;"' +# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' } tag = qtags[tag] atts = self.pba(atts) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 9a025e0aef..42b709a681 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -28,15 +28,18 @@ class TextileMLizer(OEB2HTML): self.in_table = False self.links = {} self.list = [] + self.our_links = [] + self.our_ids = [] self.images = {} + self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) - self.style_bold = False - self.style_italic = False - self.style_under = False - self.style_strike = False - self.style_smallcap = False +# self.style_bold = False +# self.style_italic = False +# self.style_under = False +# self.style_strike = False +# self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -58,42 +61,41 @@ class TextileMLizer(OEB2HTML): return ''.join(output) def tidy_up(self, text): -# def check_count(text, tests): -# x = [] -# for i, t in enumerate(reversed(tests)): -# x.append((text.count(t), i, t)) -# if x: -# return sorted(x, key=itemgetter(0, 1), reverse=True)[0][2] -# return '' - - # Needs tweaking and finetuning - don't use yet. + # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) -# text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) -# text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) -# text = re.sub(r'(["\'])\[('+t+'\w+'+t+')\]', r'\1\2', text) -# text = re.sub(r'\[('+t+'\w+'+t+')\](["\',\.!\?])', r'\1\2', text) + text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) + text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text -# txt = check_count(text, ['\np<. ', '\np<>. ', '\np. ']) -# text = re.sub(txt+'(\S)', r'\n\1', text) - - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '-']) + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r' +\n', r'\n', text) text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\. bc\.', r'\nbc.', text) - text = re.sub(r'\nbq\. \n\np\. ', r'\nbq. ', text) + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) text = re.sub(r'p.*\. \n\n', r'', text) -# text = re.sub(u'\n \n', r'\n
\n', text) # blank paragraph - br tag text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph text = re.sub(r' \|', r'|', text) + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + + # Now tidyup links and ids - remove ones that don't have a correponding opposite + if self.opts.keep_links: + for i in self.our_links: + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) + for i in self.our_ids: + if i not in self.our_links: + text = re.sub(r'\('+i+'\)', '', text) # started work on trying to fix footnotes -# text = re.sub(r'\^"(\d+)":#.+\^', r'[\1]', text) +# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text def remove_newlines(self, text): @@ -102,16 +104,30 @@ class TextileMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t +', '', text) -# text = re.sub(r'\n +', '', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False return text - def remove_leading_ws(self, text): - text = text.replace('\r\n', '\n') - text = text.replace('\r', '\n') - text = re.sub(r'\n[\t ]+', '\n', text) - text = re.sub(r'\n{2,}', '\n', text) - return text +# def remove_leading_ws(self, text): +# text = text.replace('\r\n', '\n') +# text = text.replace('\r', '\n') +# text = re.sub(r'\n[\t ]+', '\n', text) +# text = re.sub(r'\n{2,}', '\n', text) +# return text + + def check_styles(self, style): + txt = '{' +# style_string = '%s;' % style +# txt += style_string + if style['color'] and style['color'] != 'black': + txt += 'color:'+style['color']+';' +# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): +# txt += 'font-size: %d;' % style['font-size'] + txt += '}' + if txt == '{}': txt = '' + return txt def check_halign(self, style): tests = {'left':'<','justify':'<>','center':'=','right':'>'} @@ -140,18 +156,18 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' - if attribs.has_key('id'): - #if attribs['id'] in self.links: - txt = '(#'+attribs['id']+')' + if attribs.has_key('id'): # and attribs['id'] in self.links.values(): + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) return txt - def build_block(self, tag, style, attribs, finish): + def build_block(self, tag, style, attribs): txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) txt += self.check_halign(style) - txt += finish + txt += self.check_styles(style) return txt def dump_text(self, elem, stylizer, page, tag_stack=[]): @@ -175,38 +191,35 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': return [''] - # Soft scene breaks. - text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): - #For debugging - if tag == 'h1': - for i in self.links: - text.append(i) - text.append('\n') if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs, '. ')) + text.append(self.build_block(tag, style, attribs)) + text.append('. 
') tags.append('\n') - if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): - if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - if self.style_bold == False: - text.append('*') - tags.append('*') - self.style_bold = True if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') +# text.append('from '+tag) tags.append('_') self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + style_string = '%s;' % style + text.append(style_string) + if self.style_bold == False: + text.append('*') +# text.append('from '+tag) + tags.append('*') + self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: @@ -218,16 +231,12 @@ class TextileMLizer(OEB2HTML): text.append('-') tags.append('-') self.style_strike = True - if style['font-variant'] == 'small-caps': - if self.style_smallcap == False: - text.append('&') - tags.append('&') - self.style_smallcap = True if tag == 'br': text.append('') tags.append('\n') + self.remove_space_after_newline = True elif tag == 'blockquote': - text.append('bq. ') + text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): text.append('') @@ -241,8 +250,8 @@ class TextileMLizer(OEB2HTML): tags.append('~') elif tag == 'code': if self.in_pre: - text.append('bc. ') - tags.append('\n') + text.append('\nbc. ') + tags.append('') else: text.append('@') tags.append('@') @@ -254,12 +263,14 @@ class TextileMLizer(OEB2HTML): tags.append('\n') elif tag == 'pre': self.in_pre = True - text.append('pre. ') - tags.append('pre') + text.append('\npre. ') + tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: text.append ('"') - tags.append('":' + attribs['href']) + if attribs.has_key('href'): + tags.append('":' + attribs['href']) + self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') elif tag == 'img': @@ -275,14 +286,15 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('') + text.append('\n') tags.append(tag) elif tag == 'li': -# text.append('\n') if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} + text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + tags.append('\n') elif tag == 'dl': text.append('\n') tags.append('') @@ -298,6 +310,7 @@ class TextileMLizer(OEB2HTML): elif tag == 'table': self.in_table = True text.append('') + tags.append('') tags.append('table') elif tag == 'tr': text.append('') @@ -315,18 +328,33 @@ class TextileMLizer(OEB2HTML): text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_. ') + text.append('|_') + + text.append('. 
') tags.append('') + elif tag == 'span': + if style['font-variant'] == 'small-caps': + if self.style_smallcap == False: + text.append('&') + tags.append('&') + self.style_smallcap = True + else: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - if tag == 'span': - text.append(' %') - tags.append('% ') - text.append('(#' + attribs['id'] + u')\xa0') - - # If wanted process all style tags here - before taxt in tags is written + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + text.append(self.check_id_tag(attribs)) + # Process the styles for any that we want to keep + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): + text.append(self.check_styles(style)) + # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text @@ -351,21 +379,23 @@ class TextileMLizer(OEB2HTML): self.in_table = False if tag in ('ul', 'ol'): if self.list: self.list.pop() + if not self.list: text.append('\n') else: text.append('%s' % t) - if t == '*': - self.style_bold = False - if t == '_': - self.style_italic = False - if t == '+': - self.style_under = False - if t == '-': - self.style_strike = False - if t == '&': - self.style_smallcap = False + if t == '*': self.style_bold = False + if t == '_': self.style_italic = False + if t == '+': self.style_under = False + if t == '-': self.style_strike = False + if t == '&': self.style_smallcap = False # Soft scene breaks. text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) +# try: +# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) +# if ems >= 1: +# text.append('\n' * ems) +# except: +# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: From 05331d7f05de3ed3010a63b5c0d754452ee23782 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 30 Apr 2011 09:43:09 -0400 Subject: [PATCH 06/25] TXT: Textile changes. --- src/calibre/ebooks/txt/processor.py | 2 + src/calibre/ebooks/txt/textileml.py | 231 ++++++++++++++++------------ 2 files changed, 135 insertions(+), 98 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 7e161f63bd..54369190de 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -242,6 +242,8 @@ def detect_formatting_type(txt): textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt)) # Links textile_count += len(re.findall(r'"[^"]*":\S+', txt)) + # paragraph blocks + textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt)) # Decide if either markdown or textile is used in the text # based on the number of unique formatting elements found. 
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 42b709a681..622ff8d2e3 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en' ''' Transform OEB content into Textile formatted plain text ''' - import re from functools import partial @@ -16,8 +15,6 @@ from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.txt.unsmarten import unsmarten -from operator import itemgetter - class TextileMLizer(OEB2HTML): @@ -29,17 +26,20 @@ class TextileMLizer(OEB2HTML): self.links = {} self.list = [] self.our_links = [] + self.in_a_link = False self.our_ids = [] self.images = {} + self.id_no_text = u'' + self.style_embed = [] self.remove_space_after_newline = False self.base_hrefs = [item.href for item in oeb_book.spine] self.map_resources(oeb_book) -# self.style_bold = False -# self.style_italic = False -# self.style_under = False -# self.style_strike = False -# self.style_smallcap = False + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False txt = self.mlize_spine(oeb_book) txt = unsmarten(txt) @@ -56,7 +56,7 @@ class TextileMLizer(OEB2HTML): self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) @@ -64,36 +64,47 @@ class TextileMLizer(OEB2HTML): # Needs tweaking and finetuning def check_escaping(text, tests): for t in tests: - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + self.log.debug('DEBUG: ' + txt) + if txt != '%': + text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) - - text = re.sub(r' +\n', r'\n', text) - text = re.sub(r'^\n+', r'', text) - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) - text = re.sub(r'\nbq\.\n?\np\. ', r'\nbq. ', text) - text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) - text = re.sub(r'\n{3}', r'\n\n', text) - text = re.sub(r'(p.*\. \n?)(p.*\. )', r'\2', text) - text = re.sub(r'p.*\. \n\n', r'', text) - text = re.sub(u'p.*\. \xa0', r'p. ', text) # blank paragraph - text = re.sub(r' \|', r'|', text) - # Now put back spaces removed earlier as they're needed here - text = re.sub(r'\np\.\n', r'\np. 
\n', text) - # Now tidyup links and ids - remove ones that don't have a correponding opposite if self.opts.keep_links: for i in self.our_links: - if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i, '\1', text) for i in self.our_ids: if i not in self.our_links: text = re.sub(r'\('+i+'\)', '', text) + + # Note - I'm not checking for escaped '-' as this will also get hypenated words + text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = re.sub(r'%\xa0+', r'%', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output + text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline + text = re.sub(r'^\n+', r'', text) #remove newlines at top of file + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para + text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines + text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines + # started work on trying to fix footnotes # text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text @@ -110,21 +121,15 @@ class TextileMLizer(OEB2HTML): self.remove_space_after_newline = False return text -# def remove_leading_ws(self, text): -# text = text.replace('\r\n', '\n') -# text = text.replace('\r', '\n') -# text = re.sub(r'\n[\t ]+', '\n', text) -# text = re.sub(r'\n{2,}', '\n', text) -# return text - def check_styles(self, style): txt = '{' -# style_string = '%s;' % style -# txt += style_string if style['color'] and style['color'] != 'black': txt += 'color:'+style['color']+';' -# if style['font-size']:# in ('big', 'bigger', 'small', 'smaller'): -# txt += 'font-size: %d;' % style['font-size'] + try: + if style['background']: + txt += 'background:'+style['background']+';' + except: + pass txt += '}' if txt == '{}': txt = '' return txt @@ -137,7 +142,7 @@ class TextileMLizer(OEB2HTML): return '' def check_valign(self, style): - tests = {'top':'^','bottom':'~', 'middle':'-'} + tests = {'top':'^','bottom':'~'} #, 'middle':'-'} for i in tests: if style['vertical-align'] == i: return tests[i] @@ -157,8 +162,9 @@ class TextileMLizer(OEB2HTML): def check_id_tag(self, attribs): txt = '' if attribs.has_key('id'): # and attribs['id'] in self.links.values(): - txt = '(#'+attribs['id']+ ')' - self.our_ids.append('#'+attribs['id']) + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' return txt def build_block(self, tag, style, attribs): @@ -170,7 +176,7 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt - def dump_text(self, elem, stylizer, page, tag_stack=[]): + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. @stylizer: The style information attached to the element. 
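
# Sketch only, not the calibre code: a plain dict stands in for the Stylizer
# output so the block-signature assembly done by build_block() above can be seen
# in isolation. The '=' centre sign is assumed from check_halign() (not shown in
# this hunk), and the trailing '. ' is what dump_text() appends after build_block().
def sketch_block(tag, attribs, style, keep_links=True):
    txt = '\n' + tag
    if keep_links and 'id' in attribs:
        txt += '(#' + attribs['id'] + ')'           # check_id_tag()
    if style.get('text-align') == 'center':
        txt += '='                                  # assumed centre sign
    if style.get('color') and style['color'] != 'black':
        txt += '{color:' + style['color'] + ';}'    # check_styles()
    return txt + '. '

print repr(sketch_block('h2', {'id': 'chap1'}, {'text-align': 'center', 'color': 'gray'}))
# -> '\nh2(#chap1)={color:gray;}. '
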
@@ -197,45 +203,59 @@ class TextileMLizer(OEB2HTML): or style['visibility'] == 'hidden': return [''] + # Soft scene breaks. + text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) - text.append('. ') - tags.append('\n') + block = self.build_block(tag, style, attribs) + # Normal paragraph with no styling. + if block == '\np': + text.append('\n\n') + tags.append('\n') + else: + text.append(block) + text.append('. ') + tags.append('\n') + #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: text.append('_') -# text.append('from '+tag) tags.append('_') + self.style_embed.append ('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): - style_string = '%s;' % style - text.append(style_string) if self.style_bold == False: text.append('*') -# text.append('from '+tag) tags.append('*') + self.style_embed.append ('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: text.append('+') tags.append('+') + self.style_embed.append ('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: text.append('-') tags.append('-') + self.style_embed.append ('-') self.style_strike = True if tag == 'br': - text.append('') - tags.append('\n') + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') self.remove_space_after_newline = True - elif tag == 'blockquote': + if tag == 'blockquote': text.append('\nbq. ') tags.append('\n') elif tag in ('abbr', 'acronym'): @@ -259,7 +279,7 @@ class TextileMLizer(OEB2HTML): text.append('??') tags.append('??') elif tag == 'hr': - text.append('\n***\n') + text.append('\n***') tags.append('\n') elif tag == 'pre': self.in_pre = True @@ -267,12 +287,14 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append ('"') + text.append('"') + tags.append('a') if attribs.has_key('href'): tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) if attribs.has_key('title'): tags.append('(' + attribs['title'] + ')') + self.in_a_link = True elif tag == 'img': if self.opts.keep_image_references: txt = '!' + self.check_halign(style) @@ -286,7 +308,7 @@ class TextileMLizer(OEB2HTML): tags.append('!') elif tag in ('ol', 'ul'): self.list.append({'name':tag, 'num':0}) - text.append('\n') + text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] @@ -294,7 +316,7 @@ class TextileMLizer(OEB2HTML): text.append('\n') if li['name'] == 'ul': text.append('*'*len(self.list)+' ') elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') - tags.append('\n') + tags.append('') elif tag == 'dl': text.append('\n') tags.append('') @@ -308,12 +330,19 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - self.in_table = True - text.append('') + txt = self.build_block(tag, style, attribs) + txt += '. \n' + if txt != '\ntable. 
\n': + text.append(txt) + else: + text.append('\n') tags.append('') - tags.append('table') elif tag == 'tr': - text.append('') + txt = self.build_block('', style, attribs) + txt += '. ' + if txt != '\n. ': + txt = re.sub ('\n','',txt) + text.append(txt) tags.append('|\n') elif tag == 'td': text.append('|') @@ -324,13 +353,15 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] + try: + txt += self.check_styles(style) + except: + pass if txt != '': text.append(txt+'. ') tags.append('') elif tag == 'th': - text.append('|_') - - text.append('. ') + text.append('|_. ') tags.append('') elif tag == 'span': if style['font-variant'] == 'small-caps': @@ -339,35 +370,36 @@ class TextileMLizer(OEB2HTML): tags.append('&') self.style_smallcap = True else: - txt = '%' - if self.opts.keep_links: - txt += self.check_id_tag(attribs) - txt += self.check_styles(style) - if txt != '%': - text.append(txt) - tags.append('%') + if self.in_a_link == False: + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') if self.opts.keep_links and attribs.has_key('id'): - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span'): + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): text.append(self.check_id_tag(attribs)) # Process the styles for any that we want to keep - if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', 'span'): - text.append(self.check_styles(style)) + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \ + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - if self.in_table: - txt = self.remove_newlines(txt) - else: - txt = self.remove_leading_ws(txt) + txt = self.remove_newlines(txt) text.append(txt) + self.id_no_text = u'' # Recurse down into tags within the tag we are in. for item in elem: - text += self.dump_text(item, stylizer, page, tag_stack+tags) + text += self.dump_text(item, stylizer) # Close all open tags. tags.reverse() @@ -375,36 +407,39 @@ class TextileMLizer(OEB2HTML): if tag in ('pre', 'ul', 'ol', 'li', 'table'): if tag == 'pre': self.in_pre = False - if tag == 'table': - self.in_table = False - if tag in ('ul', 'ol'): + elif tag in ('ul', 'ol'): if self.list: self.list.pop() if not self.list: text.append('\n') else: - text.append('%s' % t) - if t == '*': self.style_bold = False - if t == '_': self.style_italic = False - if t == '+': self.style_under = False - if t == '-': self.style_strike = False - if t == '&': self.style_smallcap = False + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t == '*': + self.style_bold = False + elif t == '_': + self.style_italic = False + elif t == '+': + self.style_under = False + elif t == '-': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*', '_', '+', '-'): + txt = self.style_embed.pop() + text.append(txt) + else: + text.append('%s' % t) # Soft scene breaks. 
text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) -# try: -# ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) -# if ems >= 1: -# text.append('\n' * ems) -# except: -# pass # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - if self.in_table: - tail = self.remove_newlines(tail) - else: - tail = self.remove_leading_ws(tail) + tail = self.remove_newlines(tail) text.append(tail) return text From 8853f6c1468bebd72e360517c4117a3764f9edfe Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 1 May 2011 10:24:56 -0400 Subject: [PATCH 07/25] ... --- src/calibre/ebooks/txt/output.py | 14 ++++++++++++-- src/calibre/ebooks/txt/textileml.py | 25 ++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 261ace2f91..d9c42eb1dc 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -66,6 +66,13 @@ class TXTOutput(OutputFormatPlugin): help=_('Do not remove image references within the document. This is only ' \ 'useful when paired with a txt-output-formatting option that ' 'is not none because links are always removed with plain text output.')), + OptionRecommendation(name='keep_color', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Do not remove font color from output. This is only useful when ' \ + 'txt-output-formatting is set to textile. Textile is the only ' \ + 'formatting that supports setting font color. If this option is ' \ + 'not specified font color will not be set and default to the ' \ + 'color displayed by the reader (generally this is black).')), ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): @@ -111,9 +118,12 @@ class TXTZOutput(TXTOutput): from calibre.ebooks.oeb.base import OEB_IMAGES with TemporaryDirectory('_txtz_output') as tdir: # TXT - with TemporaryFile('index.txt') as tf: + txt_name = 'index.txt' + if opts.txt_output_formatting.lower() == 'textile': + txt_name = 'index.text' + with TemporaryFile(txt_name) as tf: TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log) - shutil.copy(tf, os.path.join(tdir, 'index.txt')) + shutil.copy(tf, os.path.join(tdir, txt_name)) # Images for item in oeb_book.manifest: diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 622ff8d2e3..1c35670596 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -98,7 +98,7 @@ class TextileMLizer(OEB2HTML): text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) - text = re.sub(u'\np.*\.\xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -176,6 +176,11 @@ class TextileMLizer(OEB2HTML): txt += self.check_styles(style) return txt + def prepare_string_for_textile(self, txt): + if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): + return ' ==%s== ' % txt + return txt + def dump_text(self, elem, stylizer): ''' @elem: The element in the etree that we are working on. 
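
# Quick illustration of prepare_string_for_textile() just above, with made-up
# strings: any run that already carries stray Textile markup next to whitespace
# is wrapped in ==...== so the Textile reader will leave it alone.
import re
pattern = r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)'
for txt in ('2 + 2 = 4', 'an ordinary sentence'):
    if re.search(pattern, txt):
        txt = ' ==%s== ' % txt
    print repr(txt)
# ' ==2 + 2 = 4== '        (" + " and " = " trigger the escape)
# 'an ordinary sentence'   (left untouched)
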
@@ -197,7 +202,7 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': @@ -209,15 +214,9 @@ class TextileMLizer(OEB2HTML): if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - block = self.build_block(tag, style, attribs) - # Normal paragraph with no styling. - if block == '\np': - text.append('\n\n') - tags.append('\n') - else: - text.append(block) - text.append('. ') - tags.append('\n') + text.append(self.build_block(tag, style, attribs)) + text.append('. ') + tags.append('\n') #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): @@ -393,7 +392,7 @@ class TextileMLizer(OEB2HTML): if hasattr(elem, 'text') and elem.text: txt = elem.text if not self.in_pre: - txt = self.remove_newlines(txt) + txt = self.prepare_string_for_textile(self.remove_newlines(txt)) text.append(txt) self.id_no_text = u'' @@ -439,7 +438,7 @@ class TextileMLizer(OEB2HTML): if hasattr(elem, 'tail') and elem.tail: tail = elem.tail if not self.in_pre: - tail = self.remove_newlines(tail) + tail = self.prepare_string_for_textile(self.remove_newlines(tail)) text.append(tail) return text From 4bdbab22ca6e8818b76e0ae98ec30094dd00622d Mon Sep 17 00:00:00 2001 From: Li Fanxi Date: Sun, 8 May 2011 22:28:47 +0800 Subject: [PATCH 08/25] Finish the Douban.com books metadata source plugin --- src/calibre/ebooks/metadata/sources/douban.py | 55 +++++++++---------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index 8f1794b33f..7a8619261b 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -40,8 +40,8 @@ publisher = XPath("descendant::db:attribute[@name='publisher']") isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") -tag = XPath("descendant::db:tag") -rating = XPath("descendant::gd:rating[@name='average']") +booktag = XPath("descendant::db:tag/attribute::name") +rating = XPath("descendant::gd:rating/attribute::average") cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") def get_details(browser, url, timeout): # {{{ @@ -51,7 +51,7 @@ def get_details(browser, url, timeout): # {{{ gc = getattr(e, 'getcode', lambda : -1) if gc() != 403: raise - # Google is throttling us, wait a little + # Douban is throttling us, wait a little time.sleep(2) raw = browser.open_novisit(url, timeout=timeout).read() @@ -59,7 +59,6 @@ def get_details(browser, url, timeout): # {{{ # }}} def to_metadata(browser, log, entry_, timeout): # {{{ - def get_text(extra, x): try: ans = x(extra) @@ -71,7 +70,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.exception('Programming error:') return None - id_url = entry_id(entry_)[0].text douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() @@ -92,9 +90,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ except: log.exception('Failed to get additional details for', mi.title) return mi - mi.comments = get_text(extra, description) - #mi.language = get_text(extra, language) mi.publisher = get_text(extra, publisher) # ISBN @@ -108,7 +104,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # 
Tags try: - btags = [x.text for x in subject(extra) if x.text] + btags = [x for x in booktag(extra) if x] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] @@ -120,7 +116,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] - + # pubdate pubdate = get_text(extra, date) if pubdate: @@ -133,7 +129,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # Ratings if rating(extra): try: - mi.rating = float(rating(extra).text) / 2.0 + mi.rating = float(rating(extra)[0]) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 @@ -141,10 +137,8 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # Cover mi.has_douban_cover = None u = cover_url(extra) - print(u) if u: u = u[0].replace('/spic/', '/lpic/'); - print(u) # If URL contains "book-default", the book doesn't have a cover if u.find('book-default') == -1: mi.has_douban_cover = u @@ -155,26 +149,24 @@ class Douban(Source): name = 'Douban Books' author = _('Li Fanxi') + version = (2, 0, 0) description = _('Downloads metadata from Douban.com') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', - 'comments', 'publisher', 'identifier:isbn', 'rating', + 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'identifier:douban']) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' - DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s' -# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' - -# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) + DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/' def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return DOUBAN_ID_URL % db + return ('douban', db, self.DOUBAN_BOOK_URL%db) else: return None # }}} @@ -182,13 +174,18 @@ class Douban(Source): def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ SEARCH_URL = 'http://api.douban.com/book/subjects?' 
ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + SUBJECT_URL = 'http://api.douban.com/book/subject/' q = '' t = None isbn = check_isbn(identifiers.get('isbn', None)) + subject = identifiers.get('douban', None) if isbn is not None: q = isbn t = 'isbn' + elif subject is not None: + q = subject + t = 'subject' elif title or authors: def build_term(prefix, parts): return ' '.join(x for x in parts) @@ -209,6 +206,8 @@ class Douban(Source): url = None if t == "isbn": url = ISBN_URL + q + elif t == 'subject': + url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ 'q': q, @@ -314,14 +313,12 @@ class Douban(Source): except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) - if not title: - title = "" if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) - # There is no point running these queries in threads as google + # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) @@ -329,23 +326,23 @@ class Douban(Source): # }}} if __name__ == '__main__': # tests {{{ - # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py from calibre.ebooks.metadata.sources.test import (test_identify_plugin, title_test, authors_test) - test_identify_plugin(GoogleBooks.name, + test_identify_plugin(Douban.name, [ ( - {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby', - 'authors':['Fitzgerald']}, - [title_test('The great gatsby', exact=True), - authors_test(['Francis Scott Fitzgerald'])] + {'identifiers':{'isbn': '9787536692930'}, 'title':'三体', + 'authors':['刘慈欣']}, + [title_test('三体', exact=True), + authors_test(['刘慈欣'])] ), ( - {'title': 'Flatland', 'authors':['Abbott']}, - [title_test('Flatland', exact=False)] + {'title': 'Linux内核修炼之道', 'authors':['任桥伟']}, + [title_test('Linux内核修炼之道', exact=False)] ), ]) # }}} From 803c0449b9b2d7e479658e03f555c215eacad026 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 8 May 2011 16:01:36 -0400 Subject: [PATCH 09/25] ... --- src/calibre/customize/profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 5c29f1e79b..de82aaffa1 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -253,7 +253,7 @@ class OutputProfile(Plugin): periodical_date_in_title = True #: Characters used in jackets and catalogs - missing_char = u'x' + missing_char = u'x' ratings_char = u'*' empty_ratings_char = u' ' read_char = u'+' From 5ac915b416c49189606311a2524d59d5a3f8feeb Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 8 May 2011 16:39:45 -0400 Subject: [PATCH 10/25] Leigh's latest changes. 
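
The textileml.py diff below reworks check_padding(): left and right margin plus
padding are converted to points with unit_convert(), divided by the output
profile's base font size, and each whole em becomes one '(' or ')' on the block
signature. A sketch of that arithmetic with invented numbers (24pt of combined
left indent on a 12pt base font):

    left_pts, fbase = 24.0, 12
    emleft = int(round(left_pts / fbase))   # 2
    prefix = '(' * emleft                   # '((' -> a block such as "p((. indented text"
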
--- src/calibre/ebooks/txt/textileml.py | 106 ++++++++++++++++------------ 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 1c35670596..2f04c4676b 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -14,6 +14,7 @@ from functools import partial from calibre.ebooks.htmlz.oeb2html import OEB2HTML from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks import unit_convert from calibre.ebooks.txt.unsmarten import unsmarten class TextileMLizer(OEB2HTML): @@ -55,20 +56,19 @@ class TextileMLizer(OEB2HTML): self.log.debug('Converting %s to Textile formatted TXT...' % item.href) self.rewrite_ids(item.data, item) rewrite_links(item.data, partial(self.rewrite_link, page=item)) - stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output.append('\n\n') return ''.join(output) def tidy_up(self, text): - # Needs tweaking and finetuning + # May need tweaking and finetuning def check_escaping(text, tests): for t in tests: # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged txt = '%s' % t - self.log.debug('DEBUG: ' + txt) if txt != '%': - text = re.sub(r'(\S)'+t+t+'(\S)', r'\1\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) return text @@ -87,26 +87,26 @@ class TextileMLizer(OEB2HTML): text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans + text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline text = re.sub(r'^\n+', r'', text) #remove newlines at top of file text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph + text = re.sub(u'\np[<>=]{0,2}\. \xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. 
\n', text) text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + return text def remove_newlines(self, text): @@ -123,13 +123,11 @@ class TextileMLizer(OEB2HTML): def check_styles(self, style): txt = '{' - if style['color'] and style['color'] != 'black': - txt += 'color:'+style['color']+';' - try: - if style['background']: + if self.opts.keep_color: + if 'color' in style.cssdict() and style['color'] != 'black': + txt += 'color:'+style['color']+';' + if 'background' in style.cssdict(): txt += 'background:'+style['background']+';' - except: - pass txt += '}' if txt == '{}': txt = '' return txt @@ -148,30 +146,44 @@ class TextileMLizer(OEB2HTML): return tests[i] return '' - def check_padding(self, style, tests): + def check_padding(self, style, stylizer): txt = '' - for i in tests: - try: - ems = int(round(float(style[i[0]] / style['font-size']))) - if ems >=1: - txt += i[1] * ems - except: - pass + left_padding_pts = 0 + left_margin_pts = 0 + if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto': + left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto': + left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi) + left = left_margin_pts + left_padding_pts + emleft = int(round(left / stylizer.profile.fbase)) + if emleft >= 1: + txt += '(' * emleft + right_padding_pts = 0 + right_margin_pts = 0 + if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto': + right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto': + right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi) + right = right_margin_pts + right_padding_pts + emright = int(round(right / stylizer.profile.fbase)) + if emright >= 1: + txt += ')' * emright + return txt def check_id_tag(self, attribs): txt = '' - if attribs.has_key('id'): # and attribs['id'] in self.links.values(): + if attribs.has_key('id'): txt = '(#'+attribs['id']+ ')' self.our_ids.append('#'+attribs['id']) self.id_no_text = u'\xa0' return txt - def build_block(self, tag, style, attribs): + def build_block(self, tag, style, attribs, stylizer): txt = '\n' + tag if self.opts.keep_links: txt += self.check_id_tag(attribs) - txt += self.check_padding(style, [['padding-left','('],['padding-right',')']]) + txt += self.check_padding(style, stylizer) txt += self.check_halign(style) txt += self.check_styles(style) return txt @@ -202,22 +214,24 @@ class TextileMLizer(OEB2HTML): tags = [] tag = barename(elem.tag) attribs = elem.attrib - + # Ignore anything that is set to not be displayed. if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ or style['visibility'] == 'hidden': return [''] # Soft scene breaks. - text.append(self.check_padding(style, ['margin-top',u'\n\n\xa0'])) - + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = int(round(float(style.marginTop) / style.fontSize) - 1) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): if tag == 'div': tag = 'p' - text.append(self.build_block(tag, style, attribs)) + text.append(self.build_block(tag, style, attribs, stylizer)) text.append('. 
') tags.append('\n') - #self.style_embed = [] if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): @@ -306,15 +320,17 @@ class TextileMLizer(OEB2HTML): text.append('(' + txt + ')') tags.append('!') elif tag in ('ol', 'ul'): - self.list.append({'name':tag, 'num':0}) + self.list.append({'name': tag, 'num': 0}) text.append('') tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} + else: li = {'name': 'ul', 'num': 0} text.append('\n') - if li['name'] == 'ul': text.append('*'*len(self.list)+' ') - elif li['name'] == 'ol': text.append('#'*len(self.list)+' ') + if li['name'] == 'ul': + text.append('*' * len(self.list) + ' ') + elif li['name'] == 'ol': + text.append('#' * len(self.list) + ' ') tags.append('') elif tag == 'dl': text.append('\n') @@ -329,7 +345,7 @@ class TextileMLizer(OEB2HTML): text.append('') tags.append('\n') elif tag == 'table': - txt = self.build_block(tag, style, attribs) + txt = self.build_block(tag, style, attribs, stylizer) txt += '. \n' if txt != '\ntable. \n': text.append(txt) @@ -337,10 +353,10 @@ class TextileMLizer(OEB2HTML): text.append('\n') tags.append('') elif tag == 'tr': - txt = self.build_block('', style, attribs) + txt = self.build_block('', style, attribs, stylizer) txt += '. ' if txt != '\n. ': - txt = re.sub ('\n','',txt) + txt = re.sub ('\n', '', txt) text.append(txt) tags.append('|\n') elif tag == 'td': @@ -352,12 +368,9 @@ class TextileMLizer(OEB2HTML): txt += '\\' + attribs['colspan'] if attribs.has_key ('rowspan'): txt += '/' + attribs['rowspan'] - try: - txt += self.check_styles(style) - except: - pass + txt += self.check_styles(style) if txt != '': - text.append(txt+'. ') + text.append(txt + '. ') tags.append('') elif tag == 'th': text.append('|_. ') @@ -432,7 +445,10 @@ class TextileMLizer(OEB2HTML): text.append('%s' % t) # Soft scene breaks. - text.append(self.check_padding(style, ['margin-bottom',u'\n\n\xa0'])) + if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) + if ems >=1: + text.append(u'\n\n\xa0' * ems) # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: From d6ec680ebbbadc659a09105d66aaa60299ac1be9 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 06:43:19 -0400 Subject: [PATCH 11/25] Leigh's latest changes. 
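
Two small changes follow: in the Textile reader (functions.py) glyph substitution
now runs before link processing, and in the writer's check_escaping() the
bracketing rules accept quotes, punctuation and spaces inside a marked run.
Roughly what the widened '*' rule now catches (sample string invented):

    import re
    s = "a no-space*bold run, with punctuation!*next"
    print re.sub(r'([a-zA-Z0-9\'"\-])(\*[a-zA-Z0-9\'"!? ,.\-]+\*)', r'\1[\2]', s)
    # a no-space[*bold run, with punctuation!*]next
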
--- src/calibre/ebooks/textile/functions.py | 2 +- src/calibre/ebooks/txt/textileml.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index dd1914cf9f..b186e79ad4 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -792,6 +792,7 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) + text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -803,7 +804,6 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) - text = self.glyphs(text) return text.rstrip('\n') diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 2f04c4676b..082332ffd8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,8 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\w)('+t+'\w+'+t+')', r'\1[\2]', text) - text = re.sub(r'('+t+'\w+'+t+')(\w)', r'[\1]\2', text) + text = re.sub(r'([a-zA-Z0-9\'"\-])('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')', r'\1[\2]', text) + text = re.sub(r'('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')([a-zA-Z0-9\'"!?\-])', r'[\1]\2', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite From c384188057639b42e5e10c142f6e1425f94d09ba Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 06:47:45 -0400 Subject: [PATCH 12/25] Leigh's latest changes. --- src/calibre/ebooks/txt/textileml.py | 58 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 082332ffd8..31c118251d 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -68,9 +68,8 @@ class TextileMLizer(OEB2HTML): # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged txt = '%s' % t if txt != '%': - text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'([a-zA-Z0-9\'"\-])('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')', r'\1[\2]', text) - text = re.sub(r'('+t+'[a-zA-Z0-9\'"!? ,.\-]+'+t+')([a-zA-Z0-9\'"!?\-])', r'[\1]\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -84,7 +83,8 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'\('+i+'\)', '', text) # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\^', '\*', '_', '\+', '~', '%']) + text = check_escaping(text, ['\*', '_', '\+', '-']) +# text = check_escaping(text, ['\*', '_', '\+', '-']) text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -93,20 +93,24 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'^\n+', r'', text) #remove newlines at top of file text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras +# text = re.sub(r'\n{4,}', r'\n\np. 
\n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines +# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{0,2}\. \xa0', r'\np. ', text) # blank paragraph - text = re.sub(r'\n(p.*\.\n)(p.*\.)', r'\n\2', text) + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. \n', text) text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - + + # started work on trying to fix footnotes +# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) return text def remove_newlines(self, text): @@ -236,29 +240,29 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('_') - tags.append('_') - self.style_embed.append ('_') + text.append('[_') + tags.append('_]') + self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('*') - tags.append('*') - self.style_embed.append ('*') + text.append('[*') + tags.append('*]') + self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): if tag != 'a': if self.style_under == False: - text.append('+') - tags.append('+') - self.style_embed.append ('+') + text.append('[+') + tags.append('+]') + self.style_embed.append('+') self.style_under = True if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): if self.style_strike == False: - text.append('-') - tags.append('-') - self.style_embed.append ('-') + text.append('[-') + tags.append('-]') + self.style_embed.append('-') self.style_strike = True if tag == 'br': for i in reversed(self.style_embed): @@ -428,26 +432,24 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*': + if t == '*]': self.style_bold = False - elif t == '_': + elif t == '_]': self.style_italic = False - elif t == '+': + elif t == '+]': self.style_under = False - elif t == '-': + elif t == '-]': self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*', '_', '+', '-'): + if t in ('*]', '_]', '+]', '-]'): txt = self.style_embed.pop() - text.append(txt) - else: - text.append('%s' % t) + text.append('%s' % t) # Soft scene breaks. if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) - if ems >=1: + if ems >= 1: text.append(u'\n\n\xa0' * ems) # Add the text that is outside of the tag. From 842ba755575c108fc0c8ab93cac383185776f212 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:19:28 -0400 Subject: [PATCH 13/25] More changes. 
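
Among the glyph rules touched below, the prime-mark substitutions now require
whitespace after the quote character, so only a measurement followed by a space
is converted. A quick check of the new pattern with invented strings:

    import re
    prime = re.compile(r'(\d+)\'(\s)', re.I)
    print prime.sub(r'\1′\2', "a 9' cable and plug")   # -> a 9′ cable and plug
    print prime.sub(r'\1′\2', "don't")                 # no digit before ', untouched
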
--- src/calibre/ebooks/textile/functions.py | 34 +++++++++----- src/calibre/ebooks/txt/textileml.py | 59 ++++++++++++++++--------- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index b186e79ad4..0e1811f195 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ @@ -219,14 +219,13 @@ class Textile(object): ] glyph_defaults = [ (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign - (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime - (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr />'), #
scene-break -# (re.compile(r'\b--\b'), r'—'), # em dash - (re.compile(r'([^-])--([^-])'), r'\1—\2'), # em dash + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered @@ -706,6 +705,21 @@ class Textile(object): result.append(line) return ''.join(result) + def glyphs_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + def vAlign(self, input): d = {'^':'top', '-':'middle', '~':'bottom'} return d.get(input, '') @@ -792,7 +806,6 @@ class Textile(object): text = self.noTextile(text) text = self.code(text) - text = self.glyphs(text) text = self.links(text) if not self.noimage: @@ -804,6 +817,7 @@ class Textile(object): text = self.span(text) text = self.footnoteRef(text) + text = self.glyphs(text) return text.rstrip('\n') @@ -814,6 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ + text = self.glyphs_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' @@ -868,7 +883,7 @@ class Textile(object): >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye") 'hello span strong and bold goodbye' """ - qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&') + qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') pnct = ".,\"'?!;:" for qtag in qtags: @@ -900,9 +915,7 @@ class Textile(object): '%' : 'span', '+' : 'ins', '~' : 'sub', - '^' : 'sup', - '&' : 'span style="font-variant:small-caps;"' -# '&' : 'span style="font-transform:uppercase;font-size:smaller;"' + '^' : 'sup' } tag = qtags[tag] atts = self.pba(atts) @@ -1046,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): return Textile(restricted=True, lite=lite, noimage=noimage).textile(text, rel='nofollow', html_type=html_type) - diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 31c118251d..814ba01a3e 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -69,7 +69,8 @@ class TextileMLizer(OEB2HTML): txt = '%s' % t if txt != '%': text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text) - text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text) return text # Now tidyup links and ids - remove ones that don't have a correponding opposite @@ -77,14 +78,17 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - text = re.sub(r'"(.+)":'+i, '\1', text) + self.log.debug('Link has no target - %s ...' % i) + text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - text = re.sub(r'\('+i+'\)', '', text) + self.log.debug('ID has no link - %s ...' 
% i) + text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) - # Note - I'm not checking for escaped '-' as this will also get hypenated words - text = check_escaping(text, ['\*', '_', '\+', '-']) -# text = check_escaping(text, ['\*', '_', '\+', '-']) + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, ['\*', '_', '\*']) + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed text = re.sub(r'%\xa0+', r'%', text) #remove empty spans text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? @@ -96,13 +100,14 @@ class TextileMLizer(OEB2HTML): # text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines # text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) - text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text) + text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) + text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) - text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text) + text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables # Now put back spaces removed earlier as they're needed here @@ -193,7 +198,8 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): - if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt): +# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt @@ -240,15 +246,23 @@ class TextileMLizer(OEB2HTML): if style['font-style'] == 'italic' or tag in ('i', 'em'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): if self.style_italic == False: - text.append('[_') - tags.append('_]') + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') self.style_embed.append('_') self.style_italic = True if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): if self.style_bold == False: - text.append('[*') - tags.append('*]') + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') self.style_embed.append('*') self.style_bold = True if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): @@ -304,14 +318,17 @@ class TextileMLizer(OEB2HTML): tags.append('pre\n') elif tag == 'a': if self.opts.keep_links: - text.append('"') - tags.append('a') if attribs.has_key('href'): + text.append('"') + tags.append('a') tags.append('":' + attribs['href']) self.our_links.append(attribs['href']) - if attribs.has_key('title'): - tags.append('(' + attribs['title'] + ')') - self.in_a_link = True + if attribs.has_key('title'): + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') elif tag == 'img': if self.opts.keep_image_references: txt = '!' 
+ self.check_halign(style) @@ -432,9 +449,9 @@ class TextileMLizer(OEB2HTML): t = '' text.append(self.id_no_text) self.id_no_text = u'' - if t == '*]': + if t in ('*]', '*'): self.style_bold = False - elif t == '_]': + elif t in ('_]', '_'): self.style_italic = False elif t == '+]': self.style_under = False @@ -442,7 +459,7 @@ class TextileMLizer(OEB2HTML): self.style_strike = False elif t == '&': self.style_smallcap = False - if t in ('*]', '_]', '+]', '-]'): + if t in ('*]', '_]', '+]', '-]', '*', '_'): txt = self.style_embed.pop() text.append('%s' % t) From 3ca59beaf5c825fbb14af90b0108b7792a011924 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 9 May 2011 21:21:35 -0400 Subject: [PATCH 14/25] Add email. --- src/calibre/ebooks/textile/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 0e1811f195..8a9c6b082a 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -12,7 +12,7 @@ A Humane Web Text Generator #__date__ = '2009/12/04' __copyright__ = """ -Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, Leigh Parry Copyright (c) 2011, John Schember Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ From b95f9949be04a4d92eeabc76629cff0361817d47 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 06:37:40 -0400 Subject: [PATCH 15/25] Rename function. --- src/calibre/ebooks/textile/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 8a9c6b082a..e088d264fc 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -705,7 +705,7 @@ class Textile(object): result.append(line) return ''.join(result) - def glyphs_only(self, text): + def macros_only(self, text): # fix: hackish text = re.sub(r'"\Z', '\" ', text) @@ -828,7 +828,7 @@ class Textile(object): 'fooobar ... and hello world ...' """ - text = self.glyphs_only(text) + text = self.macros_only(text) punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' pattern = r''' From 441718f76c867da749a10607f931b8b03485d331 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:55:19 -0400 Subject: [PATCH 16/25] TXT: small Textile changes. Remove old textile conversion code. --- src/calibre/ebooks/txt/textileml.py | 58 ++++---- src/calibre/utils/html2textile.py | 209 ---------------------------- 2 files changed, 34 insertions(+), 233 deletions(-) delete mode 100644 src/calibre/utils/html2textile.py diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 814ba01a3e..17988053e8 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -78,44 +78,55 @@ class TextileMLizer(OEB2HTML): for i in self.our_links: if i[0] == '#': if i not in self.our_ids: - self.log.debug('Link has no target - %s ...' % i) text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text) for i in self.our_ids: if i not in self.our_links: - self.log.debug('ID has no link - %s ...' 
% i) text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text) # Remove obvious non-needed escaping, add sub/sup-script ones text = check_escaping(text, ['\*', '_', '\*']) - text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed - text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) - text = re.sub(r'%\xa0+', r'%', text) #remove empty spans - text = re.sub(r'%%', r'', text) #remove empty spans - MAY MERGE SOME ? - text = re.sub(r'%([_+*-]+)%', r'\1', text) #remove spans from tagged output - text = re.sub(r' +\n', r'\n', text) #remove spaces before a newline - text = re.sub(r'^\n+', r'', text) #remove newlines at top of file - text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) #correct blockcode paras - text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #correct blockquote paras -# text = re.sub(r'\n{4,}', r'\n\np. \n\n', text) #reduce blank lines + insert blank para - text = re.sub(r'\n{3}', r'\n\n', text) #reduce blank lines -# text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text) + #remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + #remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + #remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + #remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + #remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + #correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + #correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) + + #reduce blank lines + text = re.sub(r'\n{3}', r'\n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) - text = re.sub(r'\n\n {2,4}%', r'%', text) #Check span following blank para + #Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) # blank paragraph + # blank paragraph + text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) - text = re.sub(r' {2,}\|', r' |', text) #sort out spaces in tables + #sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) # Now put back spaces removed earlier as they're needed here text = re.sub(r'\np\.\n', r'\np. 
\n', text) - text = re.sub(r' \n\n\n', r' \n\n', text) #reduce blank lines - - # started work on trying to fix footnotes -# text = re.sub(r'\[\^"(\d+)":#.+\^\]', r'[\1]', text) + #reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) + return text def remove_newlines(self, text): @@ -198,7 +209,6 @@ class TextileMLizer(OEB2HTML): return txt def prepare_string_for_textile(self, txt): -# if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt): if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): return ' ==%s== ' % txt return txt diff --git a/src/calibre/utils/html2textile.py b/src/calibre/utils/html2textile.py deleted file mode 100644 index 786e912e36..0000000000 --- a/src/calibre/utils/html2textile.py +++ /dev/null @@ -1,209 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2010, Webreactor - Marcin Lulek -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from lxml import etree -from calibre.ebooks.oeb.base import barename - -class EchoTarget: - - def __init__(self): - self.final_output = [] - self.block = False - self.ol_ident = 0 - self.ul_ident = 0 - self.list_types = [] - self.haystack = [] - - def start(self, tag, attrib): - tag = barename(tag) - - newline = '\n' - dot = '' - new_tag = '' - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): - new_tag = tag - dot = '. ' - elif tag == 'p': - new_tag = '' - dot = '' - elif tag == 'blockquote': - new_tag = 'bq' - dot = '. ' - elif tag in ('b', 'strong'): - new_tag = '*' - newline = '' - elif tag in ('em', 'i'): - new_tag = '_' - newline = '' - elif tag == 'cite': - new_tag = '??' 
- newline = '' - elif tag == 'del': - new_tag = '-' - newline = '' - elif tag == 'ins': - new_tag = '+' - newline = '' - elif tag == 'sup': - new_tag = '^' - newline = '' - elif tag == 'sub': - new_tag = '~' - newline = '' - elif tag == 'span': - new_tag = '' - newline = '' - elif tag == 'a': - self.block = True - if 'title' in attrib: - self.a_part = {'title':attrib.get('title'), - 'href':attrib.get('href', '')} - else: - self.a_part = {'title':None, 'href':attrib.get('href', '')} - new_tag = '' - newline = '' - - elif tag == 'img': - if 'alt' in attrib: - new_tag = ' !%s(%s)' % (attrib.get('src'), attrib.get('title'),) - else: - new_tag = ' !%s' % attrib.get('src') - newline = '' - - elif tag in ('ul', 'ol'): - new_tag = '' - newline = '' - self.list_types.append(tag) - if tag == 'ul': - self.ul_ident += 1 - else: - self.ol_ident += 1 - - elif tag == 'li': - indent = self.ul_ident + self.ol_ident - if self.list_types[-1] == 'ul': - new_tag = '*' * indent + ' ' - newline = '\n' - else: - new_tag = '#' * indent + ' ' - newline = '\n' - - - if tag not in ('ul', 'ol'): - textile = '%(newline)s%(tag)s%(dot)s' % \ - { - 'newline':newline, - 'tag':new_tag, - 'dot':dot - } - if not self.block: - self.final_output.append(textile) - else: - self.haystack.append(textile) - - def end(self, tag): - tag = barename(tag) - - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'): - self.final_output.append('\n') - elif tag in ('b', 'strong'): - self.final_output.append('*') - elif tag in ('em', 'i'): - self.final_output.append('_') - elif tag == 'cite': - self.final_output.append('??') - elif tag == 'del': - self.final_output.append('-') - elif tag == 'ins': - self.final_output.append('+') - elif tag == 'sup': - self.final_output.append('^') - elif tag == 'sub': - self.final_output.append('~') - elif tag == 'span': - self.final_output.append('') - elif tag == 'a': - if self.a_part['title']: - textilized = ' "%s (%s)":%s ' % ( - ''.join(self.haystack), - self.a_part.get('title'), - self.a_part.get('href'), - ) - self.haystack = [] - else: - textilized = ' "%s":%s ' % ( - ''.join(self.haystack), - self.a_part.get('href'), - ) - self.haystack = [] - self.final_output.append(textilized) - self.block = False - elif tag == 'img': - self.final_output.append('!') - elif tag == 'ul': - self.ul_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - elif tag == 'ol': - self.ol_ident -= 1 - self.list_types.pop() - if len(self.list_types) == 0: - self.final_output.append('\n') - - def data(self, data): - #we dont want any linebreaks inside our tags - node_data = data.replace('\n','') - if not self.block: - self.final_output.append(node_data) - else: - self.haystack.append(node_data) - - def comment(self, text): - pass - - def close(self): - return "closed!" 
- - -def html2textile(html): - #1st pass - #clean the whitespace and convert html to xhtml - parser = etree.HTMLParser() - tree = etree.fromstring(html, parser) - xhtml = etree.tostring(tree, method="xml") - parser = etree.XMLParser(remove_blank_text=True) - root = etree.XML(xhtml, parser) - cleaned_html = etree.tostring(root) - #2nd pass build textile - target = EchoTarget() - parser = etree.XMLParser(target=target) - root = etree.fromstring(cleaned_html, parser) - textilized_text = ''.join(target.final_output).lstrip().rstrip() - return textilized_text From 5c1b683536ccb7fb221b13e35c4ae73db46cd35b Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 10 May 2011 18:58:30 -0400 Subject: [PATCH 17/25] TXT: Add keep color GUI option. --- src/calibre/gui2/convert/txt_output.py | 2 +- src/calibre/gui2/convert/txt_output.ui | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 8427f83824..816e8d7785 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'txt_output_formatting', 'keep_links', 'keep_image_references', - 'txt_output_encoding']) + 'keep_color', 'txt_output_encoding']) self.db, self.book_id = db, book_id for x in get_option('newline').option.choices: self.opt_newline.addItem(x) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 1ef9e6e6b9..36ffabb07e 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -122,6 +122,13 @@ + + + + Do not remove font color before processing + + + From 28dfc420d758cef69b9a4ea048406152b20636bb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 11:18:03 -0600 Subject: [PATCH 18/25] Fix #778208 (Fetch news from Readers Digest) --- recipes/readers_digest.recipe | 150 ++-------------------------------- 1 file changed, 9 insertions(+), 141 deletions(-) diff --git a/recipes/readers_digest.recipe b/recipes/readers_digest.recipe index 3689ca4c53..caf5cf081d 100644 --- a/recipes/readers_digest.recipe +++ b/recipes/readers_digest.recipe @@ -3,7 +3,6 @@ __license__ = 'GPL v3' ''' ''' from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.web.feeds import Feed class ReadersDigest(BasicNewsRecipe): @@ -38,151 +37,20 @@ class ReadersDigest(BasicNewsRecipe): ''' - remove_tags = [ - dict(name='h4', attrs={'class':'close'}), - dict(name='div', attrs={'class':'fromLine'}), - dict(name='img', attrs={'class':'colorTag'}), - dict(name='div', attrs={'id':'sponsorArticleHeader'}), - dict(name='div', attrs={'class':'horizontalAd'}), - dict(name='div', attrs={'id':'imageCounterLeft'}), - dict(name='div', attrs={'id':'commentsPrint'}) - ] - - feeds = [ - ('New in RD', 'http://feeds.rd.com/ReadersDigest'), - ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'), - ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'), - ('Blogs','http://feeds.rd.com/ReadersDigestBlogs') + ('Food', 'http://www.rd.com/food/feed'), + ('Health', 'http://www.rd.com/health/feed'), + ('Home', 'http://www.rd.com/home/feed'), + ('Family', 'http://www.rd.com/family/feed'), + ('Money', 'http://www.rd.com/money/feed'), + ('Travel', 'http://www.rd.com/travel/feed'), ] cover_url = 'http://www.rd.com/images/logo-main-rd.gif' - - 
-#------------------------------------------------------------------------------------------------- - - def print_version(self, url): - - # Get the identity number of the current article and append it to the root print URL - - if url.find('/article') > 0: - ident = url[url.find('/article')+8:url.find('.html?')-4] - url = 'http://www.rd.com/content/printContent.do?contentId=' + ident - - elif url.find('/post') > 0: - - # in this case, have to get the page itself to derive the Print page. - soup = self.index_to_soup(url) - newsoup = soup.find('ul',attrs={'class':'printBlock'}) - url = 'http://www.rd.com' + newsoup('a')[0]['href'] - url = url[0:url.find('&Keep')] - - return url - -#------------------------------------------------------------------------------------------------- - - def parse_index(self): - - pages = [ - ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}), - # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}), - ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'}) - + keep_only_tags = dict(id='main-content') + remove_tags = [ + {'class':['post-categories']}, ] - feeds = [] - - for page in pages: - section, url, divider, attrList = page - newArticles = self.page_parse(url, divider, attrList) - feeds.append((section,newArticles)) - - # after the pages of the site have been processed, parse several RSS feeds for additional sections - newfeeds = Feed() - newfeeds = self.parse_rss() - - - # The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable - # for this module (parse_index). - - for feed in newfeeds: - newArticles = [] - for article in feed.articles: - newArt = { - 'title' : article.title, - 'url' : article.url, - 'date' : article.date, - 'description' : article.text_summary - } - newArticles.append(newArt) - - - # New and Blogs should be the first two feeds. - if feed.title == 'New in RD': - feeds.insert(0,(feed.title,newArticles)) - elif feed.title == 'Blogs': - feeds.insert(1,(feed.title,newArticles)) - else: - feeds.append((feed.title,newArticles)) - - - return feeds - -#------------------------------------------------------------------------------------------------- - - def page_parse(self, mainurl, divider, attrList): - - articles = [] - mainsoup = self.index_to_soup(mainurl) - for item in mainsoup.findAll(attrs=attrList): - newArticle = { - 'title' : item('img')[0]['alt'], - 'url' : 'http://www.rd.com'+item('a')[0]['href'], - 'date' : '', - 'description' : '' - } - articles.append(newArticle) - - - - return articles - - - -#------------------------------------------------------------------------------------------------- - - def parse_rss (self): - - # Do the "official" parse_feeds first - feeds = BasicNewsRecipe.parse_feeds(self) - - - # Loop thru the articles in all feeds to find articles with "recipe" in it - recipeArticles = [] - for curfeed in feeds: - delList = [] - for a,curarticle in enumerate(curfeed.articles): - if curarticle.title.upper().find('RECIPE') >= 0: - recipeArticles.append(curarticle) - delList.append(curarticle) - if len(delList)>0: - for d in delList: - index = curfeed.articles.index(d) - curfeed.articles[index:index+1] = [] - - # If there are any recipes found, create a new Feed object and append. 
- if len(recipeArticles) > 0: - pfeed = Feed() - pfeed.title = 'Recipes' - pfeed.descrition = 'Recipe Feed (Virtual)' - pfeed.image_url = None - pfeed.oldest_article = 30 - pfeed.id_counter = len(recipeArticles) - # Create a new Feed, add the recipe articles, and then append - # to "official" list of feeds - pfeed.articles = recipeArticles[:] - feeds.append(pfeed) - - return feeds From 751890a83f5fef83968c0de39313980c7be3d7e7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 13:15:52 -0600 Subject: [PATCH 19/25] ... --- src/calibre/ebooks/metadata/sources/identify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index b084f86294..0cc070c3c6 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -382,7 +382,7 @@ def identify(log, abort, # {{{ if key not in filter_results: filtered_results.append(r) filter_results.add(key) - presults = filtered_results + results[plugin] = presults = filtered_results plog = logs[plugin].getvalue().strip() log('\n'+'*'*30, plugin.name, '*'*30) From e19edba3efe5fa257591ed0fe1fbfb286317257d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 14:31:21 -0600 Subject: [PATCH 20/25] EPUB Input: Ignore missing cover file when converting, instead of erroring out. Fixes #781848 ([Errno 2] No such file or directory while converting) --- src/calibre/ebooks/epub/input.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py index 917c5ad8ae..ac1d61ce59 100644 --- a/src/calibre/ebooks/epub/input.py +++ b/src/calibre/ebooks/epub/input.py @@ -103,10 +103,11 @@ class EPUBInput(InputFormatPlugin): t.set('href', guide_cover) t.set('title', 'Title Page') from calibre.ebooks import render_html_svg_workaround - renderer = render_html_svg_workaround(guide_cover, log) - if renderer is not None: - open('calibre_raster_cover.jpg', 'wb').write( - renderer) + if os.path.exists(guide_cover): + renderer = render_html_svg_workaround(guide_cover, log) + if renderer is not None: + open('calibre_raster_cover.jpg', 'wb').write( + renderer) def find_opf(self): def attr(n, attr): From 953c8e939558ed380ae0a817cd89303a6fc959f7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 15:05:55 -0600 Subject: [PATCH 21/25] Allow the use of condensed/expanded fonts as interface fonts --- src/calibre/gui2/__init__.py | 6 +++++- src/calibre/gui2/preferences/look_feel.py | 18 +++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index 1dfe1d8d14..28504f2a31 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -620,7 +620,11 @@ class Application(QApplication): self.original_font = QFont(QApplication.font()) fi = gprefs['font'] if fi is not None: - QApplication.setFont(QFont(*fi)) + font = QFont(*(fi[:4])) + s = gprefs.get('font_stretch', None) + if s is not None: + font.setStretch(s) + QApplication.setFont(font) def _send_file_open_events(self): with self._file_open_lock: diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py index 620113cc3f..ee2d7a5428 100644 --- a/src/calibre/gui2/preferences/look_feel.py +++ b/src/calibre/gui2/preferences/look_feel.py @@ -161,7 +161,11 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): def initialize(self): 
ConfigWidgetBase.initialize(self) - self.current_font = self.initial_font = gprefs['font'] + font = gprefs['font'] + if font is not None: + font = list(font) + font.append(gprefs.get('font_stretch', QFont.Unstretched)) + self.current_font = self.initial_font = font self.update_font_display() self.display_model.initialize() @@ -178,7 +182,8 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): def build_font_obj(self): font_info = self.current_font if font_info is not None: - font = QFont(*font_info) + font = QFont(*(font_info[:4])) + font.setStretch(font_info[4]) else: font = qt_app.original_font return font @@ -215,15 +220,18 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): if fd.exec_() == fd.Accepted: font = fd.selectedFont() fi = QFontInfo(font) - self.current_font = (unicode(fi.family()), fi.pointSize(), - fi.weight(), fi.italic()) + self.current_font = [unicode(fi.family()), fi.pointSize(), + fi.weight(), fi.italic(), font.stretch()] self.update_font_display() self.changed_signal.emit() def commit(self, *args): rr = ConfigWidgetBase.commit(self, *args) if self.current_font != self.initial_font: - gprefs['font'] = self.current_font + gprefs['font'] = (self.current_font[:4] if self.current_font else + None) + gprefs['font_stretch'] = (self.current_font[4] if self.current_font + is not None else QFont.Unstretched) QApplication.setFont(self.font_display.font()) rr = True self.display_model.commit() From af23efd3d6992b488803048f48efb1a1a1f7b908 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 15:09:41 -0600 Subject: [PATCH 22/25] Fix Strategy+Business --- recipes/strategy-business.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/strategy-business.recipe b/recipes/strategy-business.recipe index ab58965e98..a4697ecfcd 100644 --- a/recipes/strategy-business.recipe +++ b/recipes/strategy-business.recipe @@ -33,7 +33,7 @@ class StrategyBusinessRecipe(BasicNewsRecipe): elif c.name.endswith('_password'): br[c.name] = self.password raw = br.submit().read() - if '>Logout' not in raw: + if 'You have been logged in' not in raw: raise ValueError('Failed to login, check your username and password') return br From dc0834e8bcfdd49a84ab68cafec51d0433ab7988 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 12 May 2011 18:06:17 -0400 Subject: [PATCH 23/25] TXT: Textileml tweaks. --- src/calibre/ebooks/txt/textileml.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py index 17988053e8..36dc9952d2 100644 --- a/src/calibre/ebooks/txt/textileml.py +++ b/src/calibre/ebooks/txt/textileml.py @@ -106,17 +106,17 @@ class TextileMLizer(OEB2HTML): text = re.sub(r'\nbq\.\n?\np.*\. ', r'\nbq. ', text) #reduce blank lines - text = re.sub(r'\n{3}', r'\n\n', text) + text = re.sub(r'\n{3}', r'\n\np. \n\n', text) text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text) #Check span following blank para text = re.sub(r'\n+ +%', r' %', text) text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text) # blank paragraph - text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text) + text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) # blank paragraph - text = re.sub(u'\n\xa0', r'\np. ', text) + text = re.sub(u'\n\xa0', r'\np. ', text) # blank paragraph - text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) + text = re.sub(u'\np[<>=]{1,2}?\. \xa0', r'\np. ', text) text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text) text = re.sub(r'\n(p\. 
\n)(p.*\.|h.*\.)', r'\n\2', text) #sort out spaces in tables From dc74afe1f272de99c99480a3a10b312a6fc48176 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 16:54:20 -0600 Subject: [PATCH 24/25] ... --- src/calibre/utils/Zeroconf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/utils/Zeroconf.py b/src/calibre/utils/Zeroconf.py index fbb9b4e71f..2b3661162f 100755 --- a/src/calibre/utils/Zeroconf.py +++ b/src/calibre/utils/Zeroconf.py @@ -869,7 +869,8 @@ class Engine(threading.Thread): if DEBUG: traceback.print_exc() except: - traceback.print_exc() + if DEBUG: + traceback.print_exc() except: pass From 936a6892dcaea49b3dba3e353da8af874a39a5f0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 12 May 2011 17:15:13 -0600 Subject: [PATCH 25/25] ... --- src/calibre/ebooks/metadata/sources/amazon.py | 2 +- src/calibre/ebooks/metadata/sources/google.py | 2 +- src/calibre/ebooks/metadata/sources/overdrive.py | 2 +- src/calibre/gui2/preferences/metadata_sources.py | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 31d815af63..40cd54cfbd 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -280,7 +280,7 @@ class Worker(Thread): # Get details {{{ class Amazon(Source): name = 'Amazon.com' - description = _('Downloads metadata from Amazon') + description = _('Downloads metadata and covers from Amazon') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'identifier:amazon', diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index b479368bac..bd1043b774 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -157,7 +157,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ class GoogleBooks(Source): name = 'Google' - description = _('Downloads metadata from Google Books') + description = _('Downloads metadata and covers from Google Books') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 4ee248579e..f52b1f423b 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -30,7 +30,7 @@ base_url = 'http://search.overdrive.com/' class OverDrive(Source): name = 'Overdrive' - description = _('Downloads metadata from Overdrive\'s Content Reserve') + description = _('Downloads metadata and covers from Overdrive\'s Content Reserve') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', diff --git a/src/calibre/gui2/preferences/metadata_sources.py b/src/calibre/gui2/preferences/metadata_sources.py index 05ff23987d..f7465fb0ee 100644 --- a/src/calibre/gui2/preferences/metadata_sources.py +++ b/src/calibre/gui2/preferences/metadata_sources.py @@ -71,9 +71,10 @@ class SourcesModel(QAbstractTableModel): # {{{ plugin.is_configured()): return QIcon(I('list_remove.png')) elif role == Qt.ToolTipRole: + base = plugin.description + '\n\n' if plugin.is_configured(): - return _('This source is configured and ready to go') - return _('This source needs configuration') + return base + _('This source is configured and ready to go') + 
return base + _('This source needs configuration') return NONE def setData(self, index, val, role):
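
The tidy_up pass added to TextileMLizer (and tweaked again in patch 23) is a chain of small re.sub calls over the generated Textile text: empty spans are removed, spaces before newlines are stripped, stray pre./bc. markers are merged, and runs of blank lines are collapsed. Below is a minimal standalone sketch of that style of pass, reusing a few of the patterns from the patch on an invented sample string; the helper name and the sample input are illustrative assumptions, not part of the patch.

# -*- coding: utf-8 -*-
# Sketch of the regex-based "tidy up" style used by TextileMLizer: each pass
# is a small re.sub over the whole generated Textile text. Sample input is invented.
import re

def tidy_up_sketch(text):
    # Strip spaces left hanging before a newline.
    text = re.sub(r' +\n', '\n', text)
    # Drop blank lines at the very top of the output.
    text = re.sub(r'^\n+', '', text)
    # Collapse a stray "pre." immediately followed by "bc." into one block-code marker.
    text = re.sub(r'\npre\.\n?\nbc\.', '\nbc.', text)
    # Reduce runs of three or more newlines to a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text

if __name__ == '__main__':
    sample = '\n\np. First paragraph   \n\n\n\npre.\n\nbc. some code\n'
    print(tidy_up_sketch(sample))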
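
Patch 19's one-line change to identify() matters because the de-duplicated list was previously computed and then dropped: it has to be written back into the per-plugin results mapping. A small sketch of that order-preserving filter under assumed names; the Result shape and the de-duplication key are simplified stand-ins for calibre's real metadata objects.

from collections import namedtuple

# Simplified stand-in for the metadata results a source plugin returns.
Result = namedtuple('Result', 'title authors')

def dedupe_plugin_results(results, plugin):
    seen = set()
    filtered = []
    for r in results[plugin]:
        key = (r.title.lower(), tuple(a.lower() for a in r.authors))
        if key not in seen:
            seen.add(key)
            filtered.append(r)
    # The point of the fix: store the filtered list back into the mapping
    # instead of leaving it in a local variable that is never used.
    results[plugin] = filtered
    return results[plugin]

results = {'google': [Result('A Book', ('Jane Doe',)), Result('a book', ('jane doe',))]}
dedupe_plugin_results(results, 'google')  # leaves a single entry behind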
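
Patch 21 splits the stored interface font into the old family/size/weight/italic 4-tuple under 'font' plus a separate 'font_stretch' preference, so condensed or expanded faces survive a restart. A sketch of rebuilding the QFont from those two keys, assuming PyQt4 is importable and using a plain dict in place of calibre's gprefs object.

from PyQt4.Qt import QFont

def font_from_prefs(gprefs):
    fi = gprefs.get('font')
    if fi is None:
        return None
    font = QFont(*fi[:4])
    stretch = gprefs.get('font_stretch')
    if stretch is not None:
        # Restore the condensed/expanded stretch factor saved separately.
        font.setStretch(stretch)
    return font

prefs = {'font': ('Liberation Sans', 10, QFont.Normal, False),
         'font_stretch': QFont.SemiCondensed}
ui_font = font_from_prefs(prefs)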
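
Patch 25 changes the tooltip in the metadata-sources model so the plugin description is always shown before the configuration status. The logic reduces to the small helper below; the untranslated strings and the bare plugin argument (any object with description and is_configured()) are simplifications of the Qt model code.

def source_tooltip(plugin):
    # Show the plugin's own description first, then its configuration state.
    base = plugin.description + '\n\n'
    if plugin.is_configured():
        return base + 'This source is configured and ready to go'
    return base + 'This source needs configuration'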