Conversion pipeline: Fix regression in 0.7.46 that caused loss of some CSS information when converting HTML produced by Microsoft Word. Also remove empty tags from Microsoft namespaces when parsing HTML.

2025-07-09 03:04:10 -04:00 · 2011-03-01 11:43:00 -07:00 · 2011-03-01 11:43:00 -07:00 · 27412b5b5c
commit 27412b5b5c
parent 16ab1b8dbb
3 changed files with 44 additions and 6 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -265,16 +265,28 @@ class CSSPreProcessor(object):
    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
    # Remove some of the broken CSS Microsoft products
-    # create, slightly dangerous as it removes to end of line
+    # create
-    # rather than semi-colon
+    MS_PAT     = re.compile(r'''
-    MS_PAT     = re.compile(r'^\s*(mso-|panose-).+?$',
+        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
-            re.MULTILINE|re.IGNORECASE)
+        (%s).+?                 # The invalid selectors
        (?P<end>$|;|\})         # The end of the declaration
        '''%'mso-|panose-|text-underline|tab-interval',
        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
    def ms_sub(self, match):
        '''
        Replacement callback used with ``MS_PAT.sub``: discards the matched
        declaration and returns only the delimiters that surrounded it.

        :param match: A regex match object with a named group ``end`` and,
                      optionally, a named group ``start``.
        :return: The text of ``start`` (empty if the pattern defines no such
                 group) followed by the text of ``end``, except that an
                 ``end`` of ``';'`` is replaced by the empty string.
        '''
        end = match.group('end')
        try:
            start = match.group('start')
        except IndexError:
            # match.group() raises IndexError when the pattern that produced
            # the match has no group with this name.
            start = ''
        if end == ';':
            # Only the trailing semi-colon is dropped; a closing brace or an
            # end-of-line (empty) terminator is passed through unchanged.
            end = ''
        return start + end
    def __call__(self, data, add_namespace=False):
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        data = self.PAGE_PAT.sub('', data)
-        if '\n' in data:
+        data = self.MS_PAT.sub(self.ms_sub, data)
            data = self.MS_PAT.sub('', data)
        if not add_namespace:
            return data
        ans, namespaced = [], False
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -827,6 +827,24 @@ class Manifest(object):
                return None
            return etree.fromstring(data, parser=RECOVER_PARSER)
        def clean_word_doc(self, data):
            '''
            Remove empty elements that belong to Microsoft XML namespaces.

            Namespace prefixes are collected from every ``xmlns:<prefix>="..."``
            declaration whose value contains the text ``microsoft``. If any
            are found, a warning is logged and both ``<prefix:tag ...></prefix:tag>``
            and ``<prefix:tag .../>`` elements using those prefixes are
            deleted from the markup.

            :param data: The markup to clean, as a string.
            :return: The markup with the empty Microsoft elements removed, or
                     ``data`` unchanged if no Microsoft namespace is declared.
            '''
            prefixes = []
            # [^"]* keeps the match inside a single quoted attribute value, so
            # an unrelated namespace declared earlier on the same line is not
            # mistaken for a Microsoft one (and the real one is not skipped).
            for match in re.finditer(
                    r'xmlns:(\S+?)="[^"]*?microsoft[^"]*?"', data):
                prefix = match.group(1)
                if prefix not in prefixes:
                    prefixes.append(prefix)
            if prefixes:
                self.oeb.log.warn('Found microsoft markup, cleaning...')
                # Prefixes come from the document, so escape them before
                # interpolating them into a regular expression
                alternatives = '|'.join(re.escape(p) for p in prefixes)
                # Remove empty tags as they are not rendered by browsers
                # but can become renderable HTML tags like <p/> if the
                # document is parsed by an HTML parser
                pat = re.compile(
                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%alternatives,
                        re.DOTALL)
                data = pat.sub('', data)
                pat = re.compile(
                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%alternatives)
                data = pat.sub('', data)
            return data
        def _parse_xhtml(self, data):
            self.oeb.log.debug('Parsing', self.href, '...')
            # Convert to Unicode and normalize line endings
@ -884,6 +902,10 @@ class Manifest(object):
                        except etree.XMLSyntaxError:
                            data = etree.fromstring(data, parser=RECOVER_PARSER)
                return data
            try:
                data = self.clean_word_doc(data)
            except:
                pass
            data = first_pass(data)
            # Handle weird (non-HTML/fragment) files
@ -907,6 +929,7 @@ class Manifest(object):
                    parent.append(child)
                data = nroot
            # Force into the XHTML namespace
            if not namespace(data.tag):
                self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@ -423,6 +423,7 @@ class Stylizer(object):
 class Style(object):
    UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
    MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
    def __init__(self, element, stylizer):
        self._element = element
@ -447,6 +448,8 @@ class Style(object):
            return
        css = attrib['style'].split(';')
        css = filter(None, (x.strip() for x in css))
        css = [x.strip() for x in css]
        css = [x for x in css if self.MS_PAT.match(x) is None]
        try:
            style = CSSStyleDeclaration('; '.join(css))
        except CSSSyntaxError: