Conversion pipeline: Fix regression in 0.7.46 that caused loss of some CSS information when converting HTML produced by Microsoft Word. Also remove empty tags from Microsoft namespaces when parsing HTML.

2025-07-09 03:04:10 -04:00 · 2011-03-01 11:43:00 -07:00 · 2011-03-01 11:43:00 -07:00 · 27412b5b5c
commit 27412b5b5c
parent 16ab1b8dbb
3 changed files with 44 additions and 6 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -265,16 +265,28 @@ class CSSPreProcessor(object):

    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
    # Remove some of the broken CSS Microsoft products
-    # create, slightly dangerous as it removes to end of line
-    # rather than semi-colon
-    MS_PAT     = re.compile(r'^\s*(mso-|panose-).+?$',
-            re.MULTILINE|re.IGNORECASE)
+    # create
+    MS_PAT     = re.compile(r'''
+        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
+        (%s).+?                 # The invalid selectors
+        (?P<end>$|;|\})         # The end of the declaration
+        '''%'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+    def ms_sub(self, match):
+        end = match.group('end')
+        try:
+            start = match.group('start')
+        except:
+            start = ''
+        if end == ';':
+            end = ''
+        return start + end

    def __call__(self, data, add_namespace=False):
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        data = self.PAGE_PAT.sub('', data)
-        if '\n' in data:
-            data = self.MS_PAT.sub('', data)
+        data = self.MS_PAT.sub(self.ms_sub, data)
        if not add_namespace:
            return data
        ans, namespaced = [], False
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -827,6 +827,24 @@ class Manifest(object):
                return None
            return etree.fromstring(data, parser=RECOVER_PARSER)

+        def clean_word_doc(self, data):
+            prefixes = []
+            for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
+                prefixes.append(match.group(1))
+            if prefixes:
+                self.oeb.log.warn('Found microsoft markup, cleaning...')
+                # Remove empty tags as they are not rendered by browsers
+                # but can become renderable HTML tags like <p/> if the
+                # document is parsed by an HTML parser
+                pat = re.compile(
+                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
+                        re.DOTALL)
+                data = pat.sub('', data)
+                pat = re.compile(
+                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
+                data = pat.sub('', data)
+            return data
+
        def _parse_xhtml(self, data):
            self.oeb.log.debug('Parsing', self.href, '...')
            # Convert to Unicode and normalize line endings
@ -884,6 +902,10 @@ class Manifest(object):
                        except etree.XMLSyntaxError:
                            data = etree.fromstring(data, parser=RECOVER_PARSER)
                return data
+            try:
+                data = self.clean_word_doc(data)
+            except:
+                pass
            data = first_pass(data)

            # Handle weird (non-HTML/fragment) files
@ -907,6 +929,7 @@ class Manifest(object):
                    parent.append(child)
                data = nroot

+
            # Force into the XHTML namespace
            if not namespace(data.tag):
                self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -423,6 +423,7 @@ class Stylizer(object):

 class Style(object):
    UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
+    MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')

    def __init__(self, element, stylizer):
        self._element = element
@ -447,6 +448,8 @@ class Style(object):
            return
        css = attrib['style'].split(';')
        css = filter(None, (x.strip() for x in css))
+        css = [x.strip() for x in css]
+        css = [x for x in css if self.MS_PAT.match(x) is None]
        try:
            style = CSSStyleDeclaration('; '.join(css))
        except CSSSyntaxError: