From 60c50f39442b09872fb5aeb98a3be2bea3f4ec56 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 01:46:04 +0800
Subject: [PATCH 1/2] Tie MOBI input heuristics into the conversion PreProcessor

---
 src/calibre/ebooks/conversion/utils.py |  5 +++--
 src/calibre/ebooks/mobi/input.py       | 11 ++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..9825585cbf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -236,7 +236,7 @@ class PreProcessor(object):
                     print unicode(self.chapters_with_title)+" chapters with titles"
                 else:
                     html = chapdetect.sub(self.chapter_head, html)
-                    return html
+            return html
 
         recurse_patterns(html, True)
         html = recurse_patterns(html, False)
@@ -322,7 +322,8 @@ class PreProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, & italics tags
+        # Get rid of empty span, bold, font, & italics tags
+        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 584be71fe4..4f3a087065 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class MOBIInput(InputFormatPlugin):
 
@@ -40,10 +41,6 @@ class MOBIInput(InputFormatPlugin):
         return mr.created_opf_path
 
     def heuristics(self, options, html):
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-        return html
-
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)

From d354a085b8e06f3283231a18fecbf2ee775f52bd Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 01:53:49 +0800
Subject: [PATCH 2/2] Remove heuristics method and PreProcessor import from MOBI input

---
 src/calibre/ebooks/mobi/input.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 4f3a087065..8188027e01 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -5,7 +5,6 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class MOBIInput(InputFormatPlugin):
 
@@ -40,7 +39,3 @@ class MOBIInput(InputFormatPlugin):
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
 
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)