From 27412b5b5c7a4926c487895a048566cbffc1beae Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 1 Mar 2011 11:43:00 -0700
Subject: [PATCH] Conversion pipeline: Fix regression in 0.7.46 that caused
 loss of some CSS information when converting HTML produced by Microsoft Word.
 Also remove empty tags from Microsoft namespaces when parsing HTML

---
 src/calibre/ebooks/conversion/preprocess.py | 24 +++++++++++++++------
 src/calibre/ebooks/oeb/base.py              | 23 ++++++++++++++++++++
 src/calibre/ebooks/oeb/stylizer.py          |  3 +++
 3 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 5f6402f746..a1d5fa94d8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -265,16 +265,28 @@ class CSSPreProcessor(object):
 
     PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
     # Remove some of the broken CSS Microsoft products
-    # create, slightly dangerous as it removes to end of line
-    # rather than semi-colon
-    MS_PAT     = re.compile(r'^\s*(mso-|panose-).+?$',
-            re.MULTILINE|re.IGNORECASE)
+    # create (one declaration per match, up to the next ';', '}' or end of line)
+    MS_PAT     = re.compile(r'''
+        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
+        (%s).+?                 # The invalid selectors
+        (?P<end>$|;|\})         # The end of the declaration
+        '''%'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+    def ms_sub(self, match):
+        '''Return the delimiters to keep when an MS declaration is removed.'''
+        end = match.group('end')
+        try:
+            start = match.group('start')
+        except IndexError:  # raised only if the pattern has no 'start' group
+            start = ''
+        # A trailing ';' terminated the removed declaration, so drop it too
+        return start + ('' if end == ';' else end)
 
     def __call__(self, data, add_namespace=False):
         from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
         data = self.PAGE_PAT.sub('', data)
-        if '\n' in data:
-            data = self.MS_PAT.sub('', data)
+        data = self.MS_PAT.sub(self.ms_sub, data)
         if not add_namespace:
             return data
         ans, namespaced = [], False
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index ccc452f1f8..7e99916fc3 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -827,6 +827,24 @@ class Manifest(object):
                 return None
             return etree.fromstring(data, parser=RECOVER_PARSER)
 
+        def clean_word_doc(self, data):
+            '''Return data with empty tags in Microsoft XML namespaces removed.'''
+            # [^"]* keeps the match inside a single attribute value, so a prefix
+            # is collected only when its own namespace URI contains "microsoft"
+            prefixes = re.findall(r'xmlns:(\S+?)="[^"]*?microsoft[^"]*?"', data)
+            if prefixes:
+                self.oeb.log.warn('Found microsoft markup, cleaning...')
+                # Remove empty tags as they are not rendered by browsers
+                # but can become renderable HTML tags like <p/> if the
+                # document is parsed by an HTML parser
+                alt = '|'.join(map(re.escape, prefixes))
+                pat = re.compile(
+                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%alt, re.DOTALL)
+                data = pat.sub('', data)
+                pat = re.compile(r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%alt)
+                data = pat.sub('', data)
+            return data
+
         def _parse_xhtml(self, data):
             self.oeb.log.debug('Parsing', self.href, '...')
             # Convert to Unicode and normalize line endings
@@ -884,6 +902,10 @@ class Manifest(object):
                         except etree.XMLSyntaxError:
                             data = etree.fromstring(data, parser=RECOVER_PARSER)
                 return data
+            try:
+                data = self.clean_word_doc(data)
+            except:
+                pass
             data = first_pass(data)
 
             # Handle weird (non-HTML/fragment) files
@@ -907,6 +929,7 @@ class Manifest(object):
                     parent.append(child)
                 data = nroot
 
+
             # Force into the XHTML namespace
             if not namespace(data.tag):
                 self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py
index 849d161228..efc8fe1463 100644
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@@ -423,6 +423,7 @@ class Stylizer(object):
 
 class Style(object):
     UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
+    MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)', re.I)  # CSS names are case-insensitive
 
     def __init__(self, element, stylizer):
         self._element = element
@@ -447,6 +448,8 @@ class Style(object):
             return
         css = attrib['style'].split(';')
         css = filter(None, (x.strip() for x in css))
+        css = [x.strip() for x in css]
+        css = [x for x in css if self.MS_PAT.match(x) is None]
         try:
             style = CSSStyleDeclaration('; '.join(css))
         except CSSSyntaxError: