From 842ba755575c108fc0c8ab93cac383185776f212 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 9 May 2011 21:19:28 -0400
Subject: [PATCH] More changes.

---
 src/calibre/ebooks/textile/functions.py | 34 +++++++++-----
 src/calibre/ebooks/txt/textileml.py     | 59 ++++++++++++++++---------
 2 files changed, 61 insertions(+), 32 deletions(-)
diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py
index b186e79ad4..0e1811f195 100755
--- a/src/calibre/ebooks/textile/functions.py
+++ b/src/calibre/ebooks/textile/functions.py
@@ -12,7 +12,7 @@ A Humane Web Text Generator
 #__date__ = '2009/12/04'
 
 __copyright__ = """
-Copyright (c) 2011, Leigh Parry <leighparry@blueyonder.co.uk>
+Copyright (c) 2011, Leigh Parry
 Copyright (c) 2011, John Schember <john@nachtimwald.com>
 Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
 Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
@@ -219,14 +219,13 @@ class Textile(object):
     ]
     glyph_defaults = [
         (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'),                   r'\1\2&#215;\3'),                       #  dimension sign
-        (re.compile(r'(\d+)\'', re.I),                                 r'\1&#8242;'),                          #  prime
-        (re.compile(r'(\d+)\"', re.I),                                 r'\1&#8243;'),                          #  prime-double
+        (re.compile(r'(\d+)\'(\s)', re.I),                             r'\1&#8242;\2'),                          #  prime
+        (re.compile(r'(\d+)\"(\s)', re.I),                             r'\1&#8243;\2'),                          #  prime-double
         (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'),      r'<acronym title="\2">\1</acronym>'),   #  3+ uppercase acronym
         (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'),         r'<span class="caps">\1</span>'),       #  3+ uppercase
         (re.compile(r'\b(\s{0,1})?\.{3}'),                             r'\1&#8230;'),                          #  ellipsis
         (re.compile(r'^[\*_-]{3,}$', re.M),                            r'<hr />'),                             #  <hr> scene-break
-#        (re.compile(r'\b--\b'),                                        r'&#8212;'),                            #  em dash
-        (re.compile(r'([^-])--([^-])'),                                r'\1&#8212;\2'),                        #  em dash
+        (re.compile(r'(^|[^-])--([^-]|$)'),                                r'\1&#8212;\2'),                        #  em dash
         (re.compile(r'\s-(?:\s|$)'),                                   r' &#8211; '),                          #  en dash
         (re.compile(r'\b( ?)[([]TM[])]', re.I),                        r'\1&#8482;'),                          #  trademark
         (re.compile(r'\b( ?)[([]R[])]', re.I),                         r'\1&#174;'),                           #  registered
@@ -706,6 +705,21 @@ class Textile(object):
             result.append(line)
         return ''.join(result)
 
+    def glyphs_only(self, text):
+        # FIXME: hackish - append a space after a double quote at the very end of the text
+        text = re.sub(r'"\Z', '\" ', text)
+
+        result = []
+        for line in re.compile(r'(<.*?>)', re.U).split(text):
+            if not re.search(r'<.*>', line):
+                rules = []
+                if re.search(r'{.+?}', line):
+                    rules = self.macro_defaults
+                for s, r in rules:
+                    line = s.sub(r, line)
+            result.append(line)
+        return ''.join(result)
+
     def vAlign(self, input):
         d = {'^':'top', '-':'middle', '~':'bottom'}
         return d.get(input, '')
@@ -792,7 +806,6 @@ class Textile(object):
             text = self.noTextile(text)
             text = self.code(text)
 
-        text = self.glyphs(text)
         text = self.links(text)
 
         if not self.noimage:
@@ -804,6 +817,7 @@ class Textile(object):
 
         text = self.span(text)
         text = self.footnoteRef(text)
+        text = self.glyphs(text)
 
         return text.rstrip('\n')
 
@@ -814,6 +828,7 @@ class Textile(object):
         'fooobar ... and hello world ...'
         """
 
+        text = self.glyphs_only(text)
         punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
 
         pattern = r'''
@@ -868,7 +883,7 @@ class Textile(object):
         >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
         'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
         """
-        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^', r'&')
+        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
         pnct = ".,\"'?!;:"
 
         for qtag in qtags:
@@ -900,9 +915,7 @@ class Textile(object):
             '%' : 'span',
             '+' : 'ins',
             '~' : 'sub',
-            '^' : 'sup',
-            '&' : 'span style="font-variant:small-caps;"'
-#            '&' : 'span style="font-transform:uppercase;font-size:smaller;"'
+            '^' : 'sup'
         }
         tag = qtags[tag]
         atts = self.pba(atts)
@@ -1046,4 +1059,3 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'):
     return Textile(restricted=True, lite=lite,
                    noimage=noimage).textile(text, rel='nofollow',
                                             html_type=html_type)
-
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py
index 31c118251d..814ba01a3e 100644
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@@ -69,7 +69,8 @@ class TextileMLizer(OEB2HTML):
                 txt = '%s' % t
                 if txt != '%':
                     text = re.sub(r'([^'+t+'|^\n])'+t+'\]\['+t+'([^'+t+'])', r'\1\2', text)
-                text = re.sub(r'(\s|[*_])\[('+t+'[a-zA-Z0-9 \',.*_]+'+t+')\](\s|[*_])', r'\1\2\3', text)
+                    text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
+                text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
             return text
 
         # Now tidyup links and ids - remove ones that don't have a correponding opposite
@@ -77,14 +78,17 @@ class TextileMLizer(OEB2HTML):
             for i in self.our_links:
                 if i[0] == '#':
                     if i not in self.our_ids:
-                        text = re.sub(r'"(.+)":'+i, '\1', text)
+                        self.log.debug('Link has no target - %s ...' % i)
+                        text = re.sub(r'"(.+)":'+i+'(\s)', r'\1\2', text)
             for i in self.our_ids:
                 if i not in self.our_links:
-                    text = re.sub(r'\('+i+'\)', '', text)
+                    self.log.debug('ID has no link - %s ...' % i)
+                    text = re.sub(r'%?\('+i+'\)\xa0?%?', r'', text)
                     
-        # Note - I'm not checking for escaped '-' as this will also get hypenated words
-        text = check_escaping(text, ['\*', '_', '\+', '-'])
-#        text = check_escaping(text, ['\*', '_', '\+', '-'])
+        # Remove obviously unnecessary escaping; add escaping for sub/superscripts
+        text = check_escaping(text, ['\*', '_', '\*'])
+        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) # escape the super/sub-scripts if needed
+        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) # escape the super/sub-scripts if needed
 
         text = re.sub(r'%\xa0+', r'%', text)                            #remove empty spans
         text = re.sub(r'%%', r'', text)                                 #remove empty spans - MAY MERGE SOME ?
@@ -96,13 +100,14 @@ class TextileMLizer(OEB2HTML):
 #        text = re.sub(r'\n{4,}', r'\n\np. \n\n', text)                  #reduce blank lines + insert blank para
         text = re.sub(r'\n{3}', r'\n\n', text)                          #reduce blank lines
 #        text = re.sub(r' ((\* ?)+) ', r' ==\1== ', text)
-        text = re.sub(u'%\n(p[<>=]{1,2}\.)', r'%\n\n\1', text)
+        text = re.sub(u'%\n(p[<>=]{1,2}\.|p\.)', r'%\n\n\1', text)
+        text = re.sub(r'\n\n {2,4}%', r'%', text)                          #Check span following blank para
         text = re.sub(u'p[<>=]{1,2}\.\n\n?', r'', text)
         text = re.sub(r'(^|\n)p\.\n', r'\1p. \n', text)                # blank paragraph
         text = re.sub(u'\n\xa0',   r'\np. ', text)                     # blank paragraph
         text = re.sub(u'\np[<>=]{1,2}?\. \xa0',   r'\np. ', text)       # blank paragraph
         text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
-        text = re.sub(r'\n(p\. \n)(p.*\.)', r'\n\2', text)
+        text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
         text = re.sub(r' {2,}\|', r' |', text)                               #sort out spaces in tables
 
         # Now put back spaces removed earlier as they're needed here
@@ -193,7 +198,8 @@ class TextileMLizer(OEB2HTML):
         return txt
 
     def prepare_string_for_textile(self, txt):
-        if re.search(r'(\s([*&_+\-=~@%|]|\?{2}))|(([*&_+\-=~@%|]|\?{2})\s)', txt):
+#        if re.search(r'(\s([*&_+\-~@%|]|\?{2}))|(([*&_+\-~@%|]|\?{2})\s)', txt):
+        if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt):
             return ' ==%s== ' % txt
         return txt
 
@@ -240,15 +246,23 @@ class TextileMLizer(OEB2HTML):
         if style['font-style'] == 'italic' or tag in ('i', 'em'):
             if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                 if self.style_italic == False:
-                    text.append('[_')
-                    tags.append('_]')
+                    if self.in_a_link:
+                        text.append('_')
+                        tags.append('_')
+                    else:
+                        text.append('[_')
+                        tags.append('_]')
                     self.style_embed.append('_')
                     self.style_italic = True
         if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
             if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                 if self.style_bold == False:
-                    text.append('[*')
-                    tags.append('*]')
+                    if self.in_a_link:
+                        text.append('*')
+                        tags.append('*')
+                    else:
+                        text.append('[*')
+                        tags.append('*]')
                     self.style_embed.append('*')
                     self.style_bold = True
         if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
@@ -304,14 +318,17 @@ class TextileMLizer(OEB2HTML):
             tags.append('pre\n')
         elif tag == 'a':
             if self.opts.keep_links:
-                text.append('"')
-                tags.append('a')
                 if attribs.has_key('href'):
+                    text.append('"')
+                    tags.append('a')
                     tags.append('":' + attribs['href'])
                     self.our_links.append(attribs['href'])
-                if attribs.has_key('title'):
-                    tags.append('(' + attribs['title'] + ')')
-                self.in_a_link = True
+                    if attribs.has_key('title'):
+                        tags.append('(' + attribs['title'] + ')')
+                    self.in_a_link = True
+                else:
+                    text.append('%')
+                    tags.append('%')
         elif tag == 'img':
             if self.opts.keep_image_references:
                 txt = '!' + self.check_halign(style)
@@ -432,9 +449,9 @@ class TextileMLizer(OEB2HTML):
                     t = ''
                 text.append(self.id_no_text)
                 self.id_no_text = u''
-                if t == '*]':
+                if t in ('*]', '*'):
                     self.style_bold = False
-                elif t == '_]':
+                elif t in ('_]', '_'):
                     self.style_italic = False
                 elif t == '+]':
                     self.style_under = False
@@ -442,7 +459,7 @@ class TextileMLizer(OEB2HTML):
                     self.style_strike = False
                 elif t == '&':
                     self.style_smallcap = False
-                if t in ('*]', '_]', '+]', '-]'):
+                if t in ('*]', '_]', '+]', '-]', '*', '_'):
                     txt = self.style_embed.pop()
                 text.append('%s' % t)