From 804b248d46c71e5169c57da794ec2f69f2998dbf Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 16 Apr 2011 11:55:44 -0400
Subject: [PATCH] Add new but still wip textile output generator.

---
 src/calibre/ebooks/txt/output.py    |  21 +-
 src/calibre/ebooks/txt/textileml.py | 341 +++++++++++++++++++++++++---
 src/calibre/ebooks/txt/unsmarten.py | 109 +++++++++
 3 files changed, 432 insertions(+), 39 deletions(-)
 create mode 100644 src/calibre/ebooks/txt/unsmarten.py

diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index 4e54a97b45..7b50afb345 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -70,16 +70,17 @@ class TXTOutput(OutputFormatPlugin):
      ])
 
     def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        # The writer is kept on self so TXTZOutput can read self.writer.images afterwards.
         if opts.txt_output_formatting.lower() == 'markdown':
             from calibre.ebooks.txt.markdownml import MarkdownMLizer
-            writer = MarkdownMLizer(log)
+            self.writer = MarkdownMLizer(log)
         elif opts.txt_output_formatting.lower() == 'textile':
             from calibre.ebooks.txt.textileml import TextileMLizer
-            writer = TextileMLizer(log)
+            self.writer = TextileMLizer(log)
         else:
-            writer = TXTMLizer(log)
+            self.writer = TXTMLizer(log)
 
-        txt = writer.extract_content(oeb_book, opts)
+        txt = self.writer.extract_content(oeb_book, opts)
         txt = clean_ascii_chars(txt)
 
         log.debug('\tReplacing newlines with selected type...')
@@ -118,10 +119,18 @@ class TXTZOutput(TXTOutput):
             # Images
             for item in oeb_book.manifest:
                 if item.media_type in OEB_IMAGES:
-                    path = os.path.join(tdir, os.path.dirname(item.href))
+                    if hasattr(self.writer, 'images'):
+                        path = os.path.join(tdir, 'images')
+                        if item.href in self.writer.images:
+                            href = self.writer.images[item.href]
+                        else:
+                            continue
+                    else:
+                        path = os.path.join(tdir, os.path.dirname(item.href))
+                        href = os.path.basename(item.href)
                     if not os.path.exists(path):
                         os.makedirs(path)
-                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
+                    with open(os.path.join(path, href), 'wb') as imgf:
                         imgf.write(item.data)
             
             # Metadata
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py
index d7e11695c5..9651fa8971 100644
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
 __docformat__ = 'restructuredtext en'
 
 '''
@@ -10,53 +10,328 @@ Transform OEB content into Textile formatted plain text
 
 import re
 
-from lxml import etree
+from functools import partial
 
-from calibre.ebooks.oeb.base import XHTML
-from calibre.utils.html2textile import html2textile
+from calibre.ebooks.htmlz.oeb2html import OEB2HTML
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
+from calibre.ebooks.oeb.stylizer import Stylizer
+from calibre.ebooks.txt.unsmarten import unsmarten
+from operator import itemgetter
 
-class TextileMLizer(object):
 
-    def __init__(self, log):
-        self.log = log
+class TextileMLizer(OEB2HTML):
 
     def extract_content(self, oeb_book, opts):
         self.log.info('Converting XHTML to Textile formatted TXT...')
-        self.oeb_book = oeb_book
         self.opts = opts
+        self.in_pre = False  # True while inside <pre>; dump_text then leaves whitespace untouched
+        self.in_table = False  # True while inside <table>; dump_text then flattens newlines
+        self.links = {}
+        self.list = []  # stack of open <ol>/<ul> lists; its depth sets the list marker length
+        self.images = {}  # NOTE(review): not filled in this class - presumably by map_resources; confirm
+        self.base_hrefs = [item.href for item in oeb_book.spine]
+        self.map_resources(oeb_book)  # NOTE(review): not defined here - presumably inherited from OEB2HTML
 
-        return self.mlize_spine()
+        self.style_bold = False  # style_* flags: that marker is currently open, so nested elements skip it
+        self.style_italic = False
+        self.style_under = False
+        self.style_strike = False
+        self.style_smallcap = False
 
-    def mlize_spine(self):
+        txt = self.mlize_spine(oeb_book)
+        txt = unsmarten(txt)  # replace typographic characters with ASCII / brace stand-ins
+
+        # Post-process the joined text: drop redundant block markers and extra blank lines.
+        txt = self.tidy_up(txt)
+
+        return txt
+
+    def mlize_spine(self, oeb_book):  # converts every spine item; returns one joined string
         output = [u'']
-
-        for item in self.oeb_book.spine:
+        for item in oeb_book.spine:
             self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
+            self.rewrite_ids(item.data, item)  # NOTE(review): rewrite_ids/rewrite_link not defined here - presumably from OEB2HTML
+            rewrite_links(item.data, partial(self.rewrite_link, page=item))
+            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
+            output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)  # dump_text returns a list of fragments
+            output.append('\n\n')  # blank line between spine items
+        return ''.join(output)
 
-            html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
+    def tidy_up(self, text):
+        '''
+        Post-process the joined Textile output: strip the most common
+        paragraph signature, simplify doubled block markers, collapse runs
+        of blank lines and remove the space before table cell separators.
+        @text: The joined Textile text produced by mlize_spine.
+        Returns the cleaned-up text.
+        '''
 
-            if not self.opts.keep_links:
-                html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
-            if not self.opts.keep_image_references:
-                html = re.sub(r'<\s*img[^>]*>', '', html)
+        # The most frequent paragraph signature is dropped wherever text
+        # follows it. max() keeps the earliest entry on a tie.
+        common = max(['\np<. ', '\np<>. ', '\np. '], key=text.count)
+        # re.escape: the '.' in the signature must only match a literal dot.
+        text = re.sub(re.escape(common) + r'(\S)', r'\n\1', text)
 
-            text = html2textile(html)
+        # TODO: bracket inline markers (^ * _ + ~ -) that directly touch other
+        # text, e.g. x^2^ -> x[^2^]; the first attempt needed tweaking.
 
-            # Ensure the section ends with at least two new line characters.
-            # This is to prevent the last paragraph from a section being
-            # combined into the fist paragraph of the next.
-            end_chars = text[-4:]
-            # Convert all newlines to \n
-            end_chars = end_chars.replace('\r\n', '\n')
-            end_chars = end_chars.replace('\r', '\n')
-            end_chars = end_chars[-2:]
-            if not end_chars[1] == '\n':
-                text += '\n\n'
-            if end_chars[1] == '\n' and not end_chars[0] == '\n':
-                text += '\n'
+        # Simplify marker combinations produced by the element-by-element dump.
+        text = re.sub(r'\npre\. bc\.', '\nbc.', text)
+        text = re.sub(r'\np=\. p\. ', '\np. ', text)
+        text = re.sub(r'\np=\. \n', '\n', text)
 
-            output += text
+        # At most one blank line in a row; no space before a cell separator.
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        text = re.sub(r' \|', '|', text)
 
-        output = u''.join(output)
+        # TODO: footnote references of the form ^"1":#target^ are not
+        # converted yet.
+        return text
 
-        return output
+    def remove_newlines(self, text):
+        '''
+        Turn every line break into a single space, squeeze the resulting
+        runs of spaces down to one, and delete tabs outright.
+        '''
+        flat = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
+        flat = re.sub(r'[ ]{2,}', ' ', flat)
+        return re.sub(r'\t+', '', flat)
+
+    def remove_leading_ws(self, text):
+        '''Unify line endings, merge newline runs, strip tabs/spaces that follow a newline.'''
+        unified = text.replace('\r\n', '\n').replace('\r', '\n')
+        # Merge runs first so a single newline precedes any indentation.
+        unified = re.sub(r'\n+', '\n', unified)
+        return re.sub(r'\n[\t ]+', '\n', unified)
+
+    def check_align(self, style, align, tests):
+        '''
+        Return the marker of the first (value, marker) pair in tests whose
+        value equals style[align]; an empty string when none matches.'''
+        return next((pair[1] for pair in tests if style[align] == pair[0]), '')
+
+    def check_padding(self, style, tests):
+        '''Return one marker per whole em of padding for each (property, marker) pair in tests.'''
+        txt = ''
+        for i in tests:
+            try:
+                ems = int(round(float(style[i[0]]) / float(style['font-size'])))
+                txt += i[1] * max(ems, 0)
+            except Exception:  # best effort: skip unusable values, but not KeyboardInterrupt
+                pass
+        return txt
+
+    def check_id_tag(self, attribs):
+        '''
+        Return the Textile id attribute "(#id)" when attribs has an id, else
+        an empty string. Uses "in" so any mapping works, not only has_key().'''
+        return '(#' + attribs['id'] + ')' if 'id' in attribs else ''
+
+    def build_block(self, tag, style, attribs, finish):
+        '''Assemble a block signature: tag, optional id, padding markers, alignment, finish.'''
+        id_part = self.check_id_tag(attribs) if self.opts.keep_links else ''
+        padding = self.check_padding(
+            style, [['padding-left','('],['padding-right',')']])
+        alignment = self.check_align(
+            style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']])
+        return tag + id_part + padding + alignment + finish
+
+    def dump_text(self, elem, stylizer, page, tag_stack=None):
+        '''
+        Convert elem and its subtree into a list of Textile text fragments.
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+        @page: The spine item being converted; only passed down the recursion.
+        @tag_stack: Closing markers of the enclosing elements (default: none).
+        '''
+
+        # We can only process tags. If there isn't a tag return any text.
+        if not isinstance(elem.tag, basestring) \
+           or namespace(elem.tag) != XHTML_NS:
+            p = elem.getparent()
+            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
+                    and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        # Setup our variables.
+        text = ['']
+        style = stylizer.style(elem)
+        tags = []
+        tag = barename(elem.tag)
+        attribs = elem.attrib
+
+        # Skip elements that are not displayed; their tail belongs to the parent.
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            return [elem.tail] if elem.tail else ['']
+
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
+            text.append(self.build_block(tag, style, attribs, '. '))
+            tags.append('\n')
+
+        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
+            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
+                if not self.style_bold:
+                    text.append('*')
+                    tags.append('*')
+                    self.style_bold = True
+        if style['font-style'] == 'italic' or tag in ('i', 'em'):
+            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
+                if not self.style_italic:
+                    text.append('_')
+                    tags.append('_')
+                    self.style_italic = True
+        if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
+            if tag != 'a':
+                if not self.style_under:
+                    text.append('+')
+                    tags.append('+')
+                    self.style_under = True
+        if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
+            if not self.style_strike:
+                text.append('-')
+                tags.append('-')
+                self.style_strike = True
+        if style['font-variant'] == 'small-caps':
+            if not self.style_smallcap:
+                text.append('&')
+                tags.append('&')
+                self.style_smallcap = True
+        if tag == 'br':
+            text.append('')
+            tags.append('\n')
+        elif tag == 'blockquote':
+            text.append('bq. ')
+            tags.append('\n')
+        elif tag in ('abbr', 'acronym'):
+            # Only emit the "(expansion)" suffix when there is a title to expand.
+            if 'title' in attribs:
+                tags.append('(' + attribs['title'] + ')')
+        elif tag == 'sup':
+            text.append('^')
+            tags.append('^')
+        elif tag == 'sub':
+            text.append('~')
+            tags.append('~')
+        elif tag == 'code':
+            if self.in_pre:
+                text.append('bc. ')
+                tags.append('\n')
+            else:
+                text.append('@')
+                tags.append('@')
+        elif tag == 'cite':
+            text.append('??')
+            tags.append('??')
+        elif tag == 'hr':
+            text.append('\n***\n')
+            tags.append('\n')
+        elif tag == 'pre':
+            self.in_pre = True
+            text.append('pre. ')
+            tags.append('pre')
+        elif tag == 'a':
+            if self.opts.keep_links and 'href' in attribs:  # anchors without href are not links
+                text.append('"')
+                tags.append('":' + attribs['href'])
+                if 'title' in attribs:
+                    tags.append('(' + attribs['title'] + ')')
+        elif tag == 'img':
+            if self.opts.keep_image_references and 'src' in attribs:
+                text.append('!' + attribs['src'])
+                if 'alt' in attribs:
+                    txt = attribs['alt']
+                    if txt != '':
+                        text.append('(' + txt + ')')
+                tags.append('!')
+        elif tag in ('ol', 'ul'):
+            self.list.append({'name':tag, 'num':0})
+            text.append('')
+            tags.append(tag)
+        elif tag == 'li':
+            # A stray <li> outside any list is treated as a first-level bullet.
+            li = self.list[-1] if self.list else {'name':'ul', 'num':0}
+            marker = '#' if li['name'] == 'ol' else '*'
+            text.append(marker * max(len(self.list), 1) + ' ')
+        elif tag == 'dl':
+            text.append('\n')
+            tags.append('')
+        elif tag == 'dt':
+            text.append('')
+            tags.append('\n')
+        elif tag == 'dd':
+            text.append('    ')
+            tags.append('')
+        elif tag == 'table':
+            self.in_table = True
+            text.append('')
+            tags.append('table')
+        elif tag == 'tr':
+            text.append('')
+            tags.append('|\n')
+        elif tag == 'td':
+            text.append('|')
+            txt = ''
+            txt += self.check_align(style, 'text-align', [['left','<'],['justify','<>'],['center','='],['right','>']])
+            txt += self.check_align(style, 'vertical-align', [['top','^'],['bottom','~']])
+            if 'colspan' in attribs:
+                txt += '\\' + attribs['colspan']
+            if 'rowspan' in attribs:
+                txt += '/' + attribs['rowspan']
+            if txt != '':
+                text.append(txt+'. ')
+            tags.append('')
+        elif tag == 'th':
+            text.append('|_. ')
+            tags.append('')
+
+        if self.opts.keep_links and 'id' in attribs:
+            if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
+                text.append('(#' + attribs['id'] + ')')
+
+        # If wanted process all style tags here - before text in tags is written
+
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            txt = elem.text
+            if not self.in_pre:
+                if self.in_table:
+                    txt = self.remove_newlines(txt)
+                else:
+                    txt = self.remove_leading_ws(txt)
+            text.append(txt)
+
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += self.dump_text(item, stylizer, page, (tag_stack or []) + tags)
+
+        # Close all open tags. Structural markers only reset state; testing the
+        # marker (not the element) lets style markers on <li> etc. close too.
+        tags.reverse()
+        for t in tags:
+            if t == 'pre':
+                self.in_pre = False
+            elif t == 'table':
+                self.in_table = False
+            elif t in ('ol', 'ul'):
+                if self.list: self.list.pop()
+            else:
+                text.append('%s' % t)
+                if t == '*': self.style_bold = False
+                if t == '_': self.style_italic = False
+                if t == '+': self.style_under = False
+                if t == '-': self.style_strike = False
+                if t == '&': self.style_smallcap = False
+
+        # Add the text that is outside of the tag.
+        if hasattr(elem, 'tail') and elem.tail:
+            tail = elem.tail
+            if not self.in_pre:
+                if self.in_table:
+                    tail = self.remove_newlines(tail)
+                else:
+                    tail = self.remove_leading_ws(tail)
+            text.append(tail)
+
+        return text
diff --git a/src/calibre/ebooks/txt/unsmarten.py b/src/calibre/ebooks/txt/unsmarten.py
new file mode 100644
index 0000000000..30a22bf069
--- /dev/null
+++ b/src/calibre/ebooks/txt/unsmarten.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+
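+'''
+Replace typographic characters and their HTML entities with plain ASCII or brace-delimited stand-ins such as {1/2}.
+'''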
+
+__version__ = '0.1'
+__author__ = 'Leigh Parry'
+
+import re
+
+def unsmarten(txt):
+    '''Return txt with typographic characters and their entities replaced by ASCII or {..} stand-ins.'''
+    txt = re.sub(u'&#8211;|&ndash;|–', r'-', txt) # en-dash
+    txt = re.sub(u'&#8212;|&mdash;|—', r'--', txt) # em-dash
+    txt = re.sub(u'&#8230;|&hellip;|…', r'...', txt) # ellipsis
+
+    txt = re.sub(u'&#8220;|&#8221;|&#8243;|&ldquo;|&rdquo;|&Prime;|“|”|″', r'"', txt)  # double quote
+    txt = re.sub(u'(["\'‘“]|\\s)’', r"\1{'/}", txt)  # apostrophe
+    txt = re.sub(u'&#8216;|&#8217;|&#8242;|&lsquo;|&rsquo;|&prime;|‘|’|′', r"'", txt)  # single quote
+
+    txt = re.sub(u'&#162;|&cent;|¢',     r'{c\\}', txt)  # cent (emits a literal backslash)
+    txt = re.sub(u'&#163;|&pound;|£',    r'{L-}',  txt)  # pound
+    txt = re.sub(u'&#165;|&yen;|¥',      r'{Y=}',  txt)  # yen
+    txt = re.sub(u'&#169;|&copy;|©',     r'{(c)}', txt)  # copyright
+    txt = re.sub(u'&#174;|&reg;|®',      r'{(r)}', txt)  # registered
+    txt = re.sub(u'&#188;|&frac14;|¼',   r'{1/4}', txt)  # quarter
+    txt = re.sub(u'&#189;|&frac12;|½',   r'{1/2}', txt)  # half
+    txt = re.sub(u'&#190;|&frac34;|¾',   r'{3/4}', txt)  # three-quarter
+    txt = re.sub(u'&#192;|&Agrave;|À',   r'{A`}',  txt)  # A-grave
+    txt = re.sub(u'&#193;|&Aacute;|Á',   r"{A'}",  txt)  # A-acute
+    txt = re.sub(u'&#194;|&Acirc;|Â',    r'{A^}', txt)  # A-circumflex
+    txt = re.sub(u'&#195;|&Atilde;|Ã',   r'{A~}',  txt)  # A-tilde
+    txt = re.sub(u'&#196;|&Auml;|Ä',     r'{A"}',  txt)  # A-umlaut
+    txt = re.sub(u'&#197;|&Aring;|Å',    r'{Ao}',  txt)  # A-ring
+    txt = re.sub(u'&#198;|&AElig;|Æ',    r'{AE}',  txt)  # AE
+    txt = re.sub(u'&#199;|&Ccedil;|Ç',   r'{C,}',  txt)  # C-cedilla
+    txt = re.sub(u'&#200;|&Egrave;|È',   r'{E`}',  txt)  # E-grave
+    txt = re.sub(u'&#201;|&Eacute;|É',   r"{E'}",  txt)  # E-acute
+    txt = re.sub(u'&#202;|&Ecirc;|Ê',    r'{E^}', txt)  # E-circumflex
+    txt = re.sub(u'&#203;|&Euml;|Ë',     r'{E"}',  txt)  # E-umlaut
+    txt = re.sub(u'&#204;|&Igrave;|Ì',   r'{I`}',  txt)  # I-grave
+    txt = re.sub(u'&#205;|&Iacute;|Í',   r"{I'}",  txt)  # I-acute
+    txt = re.sub(u'&#206;|&Icirc;|Î',    r'{I^}', txt)  # I-circumflex
+    txt = re.sub(u'&#207;|&Iuml;|Ï',     r'{I"}',  txt)  # I-umlaut
+    txt = re.sub(u'&#208;|&ETH;|Ð',      r'{D-}',  txt)  # ETH
+    txt = re.sub(u'&#209;|&Ntilde;|Ñ',   r'{N~}',  txt)  # N-tilde
+    txt = re.sub(u'&#210;|&Ograve;|Ò',   r'{O`}',  txt)  # O-grave
+    txt = re.sub(u'&#211;|&Oacute;|Ó',   r"{O'}",  txt)  # O-acute
+    txt = re.sub(u'&#212;|&Ocirc;|Ô',    r'{O^}', txt)  # O-circumflex
+    txt = re.sub(u'&#213;|&Otilde;|Õ',   r'{O~}',  txt)  # O-tilde
+    txt = re.sub(u'&#214;|&Ouml;|Ö',     r'{O"}',  txt)  # O-umlaut
+    txt = re.sub(u'&#215;|&times;|×',    r'{x}',   txt)  # dimension
+    txt = re.sub(u'&#216;|&Oslash;|Ø',   r'{O/}',  txt)  # O-slash
+    txt = re.sub(u'&#217;|&Ugrave;|Ù',   r"{U`}",  txt)  # U-grave
+    txt = re.sub(u'&#218;|&Uacute;|Ú',   r"{U'}",  txt)  # U-acute
+    txt = re.sub(u'&#219;|&Ucirc;|Û',    r'{U^}', txt)  # U-circumflex
+    txt = re.sub(u'&#220;|&Uuml;|Ü',     r'{U"}',  txt)  # U-umlaut
+    txt = re.sub(u'&#221;|&Yacute;|Ý',   r"{Y'}",  txt)  # Y-acute
+    txt = re.sub(u'&#223;|&szlig;|ß',    r'{sz}',  txt)  # sharp-s
+    txt = re.sub(u'&#224;|&agrave;|à',   r'{a`}',  txt)  # a-grave
+    txt = re.sub(u'&#225;|&aacute;|á',   r"{a'}",  txt)  # a-acute
+    txt = re.sub(u'&#226;|&acirc;|â',    r'{a^}', txt)  # a-circumflex
+    txt = re.sub(u'&#227;|&atilde;|ã',   r'{a~}',  txt)  # a-tilde
+    txt = re.sub(u'&#228;|&auml;|ä',     r'{a"}',  txt)  # a-umlaut
+    txt = re.sub(u'&#229;|&aring;|å',    r'{ao}',  txt)  # a-ring
+    txt = re.sub(u'&#230;|&aelig;|æ',    r'{ae}',  txt)  # ae
+    txt = re.sub(u'&#231;|&ccedil;|ç',   r'{c,}',  txt)  # c-cedilla
+    txt = re.sub(u'&#232;|&egrave;|è',   r'{e`}',  txt)  # e-grave
+    txt = re.sub(u'&#233;|&eacute;|é',   r"{e'}",  txt)  # e-acute
+    txt = re.sub(u'&#234;|&ecirc;|ê',    r'{e^}', txt)  # e-circumflex
+    txt = re.sub(u'&#235;|&euml;|ë',     r'{e"}',  txt)  # e-umlaut
+    txt = re.sub(u'&#236;|&igrave;|ì',   r'{i`}',  txt)  # i-grave
+    txt = re.sub(u'&#237;|&iacute;|í',   r"{i'}",  txt)  # i-acute
+    txt = re.sub(u'&#238;|&icirc;|î',    r'{i^}', txt)  # i-circumflex
+    txt = re.sub(u'&#239;|&iuml;|ï',     r'{i"}',  txt)  # i-umlaut
+    txt = re.sub(u'&#240;|&eth;|ð',      r'{d-}',  txt)  # eth
+    txt = re.sub(u'&#241;|&ntilde;|ñ',   r'{n~}',  txt)  # n-tilde
+    txt = re.sub(u'&#242;|&ograve;|ò',   r'{o`}',  txt)  # o-grave
+    txt = re.sub(u'&#243;|&oacute;|ó',   r"{o'}",  txt)  # o-acute
+    txt = re.sub(u'&#244;|&ocirc;|ô',    r'{o^}', txt)  # o-circumflex
+    txt = re.sub(u'&#245;|&otilde;|õ',   r'{o~}',  txt)  # o-tilde
+    txt = re.sub(u'&#246;|&ouml;|ö',     r'{o"}',  txt)  # o-umlaut
+    txt = re.sub(u'&#248;|&oslash;|ø',   r'{o/}',  txt)  # o-stroke
+    txt = re.sub(u'&#249;|&ugrave;|ù',   r'{u`}',  txt)  # u-grave
+    txt = re.sub(u'&#250;|&uacute;|ú',   r"{u'}",  txt)  # u-acute
+    txt = re.sub(u'&#251;|&ucirc;|û',    r'{u^}', txt)  # u-circumflex
+    txt = re.sub(u'&#252;|&uuml;|ü',     r'{u"}',  txt)  # u-umlaut
+    txt = re.sub(u'&#253;|&yacute;|ý',   r"{y'}",  txt)  # y-acute
+    txt = re.sub(u'&#255;|&yuml;|ÿ',     r'{y"}',  txt)  # y-umlaut
+    txt = re.sub(u'&#338;|&OElig;|Œ',    r'{OE}',  txt)  # OE
+    txt = re.sub(u'&#339;|&oelig;|œ',    r'{oe}',  txt)  # oe
+    txt = re.sub(u'&#348;|&Scaron;|Ŝ',   r'{S^}', txt)  # S-circumflex; NOTE(review): &Scaron; is U+0160, not U+015C
+    txt = re.sub(u'&#349;|&scaron;|ŝ',   r'{s^}', txt)  # s-circumflex; NOTE(review): &scaron; is U+0161, not U+015D
+    txt = re.sub(u'&#8226;|&bull;|•',    r'{*}',   txt)  # bullet
+    txt = re.sub(u'&#8355;|₣',           r'{Fr}',  txt)  # Franc
+    txt = re.sub(u'&#8356;|₤',           r'{L=}',  txt)  # Lira
+    txt = re.sub(u'&#8360;|₨',           r'{Rs}',  txt)  # Rupee
+    txt = re.sub(u'&#8364;|&euro;|€',    r'{C=}',  txt)  # euro
+    txt = re.sub(u'&#8482;|&trade;|™',   r'{tm}',  txt)  # trademark
+    txt = re.sub(u'&#9824;|&spades;|♠',  r'{spade}',   txt)  # spade
+    txt = re.sub(u'&#9827;|&clubs;|♣',   r'{club}',    txt)  # club
+    txt = re.sub(u'&#9829;|&hearts;|♥',  r'{heart}',   txt)  # heart
+    txt = re.sub(u'&#9830;|&diams;|♦',   r'{diamond}', txt)  # diamond
+
+    txt = re.sub(u'\xa0',   r'p. ', txt)              # blank paragraph; NOTE(review): hits every no-break space - confirm
+    txt = re.sub(u'\n\n\n\n',   r'\n\np. \n\n', txt)  # blank paragraph
+    txt = re.sub(u'\n  \n',   r'\n<br />\n', txt)     # blank paragraph - br tag
+    return txt