From 27412b5b5c7a4926c487895a048566cbffc1beae Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 1 Mar 2011 11:43:00 -0700
Subject: [PATCH] Conversion pipeline: Fix regression in 0.7.46 that caused
 loss of some CSS information when converting HTML produced by Microsoft
 Word. Also remove empty tags from microsoft namespaces when parsing HTML

---
Note: this patch text was recovered from a copy whose line breaks and
indentation had been collapsed and whose angle-bracketed tokens had been
stripped. Leading whitespace and the tokens <start>, <end>, <p/> and
</\1:\2> are reconstructions; verify against the upstream commit before
applying. The author e-mail address could not be recovered.

 src/calibre/ebooks/conversion/preprocess.py | 24 +++++++++++++++------
 src/calibre/ebooks/oeb/base.py              | 23 ++++++++++++++++++++
 src/calibre/ebooks/oeb/stylizer.py          |  3 +++
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 5f6402f746..a1d5fa94d8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -265,16 +265,28 @@ class CSSPreProcessor(object):
 
     PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
     # Remove some of the broken CSS Microsoft products
-    # create, slightly dangerous as it removes to end of line
-    # rather than semi-colon
-    MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$',
-            re.MULTILINE|re.IGNORECASE)
+    # create
+    MS_PAT = re.compile(r'''
+        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
+        (%s).+?                 # The invalid selectors
+        (?P<end>$|;|\})         # The end of the declaration
+        '''%'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+    def ms_sub(self, match):
+        end = match.group('end')
+        try:
+            start = match.group('start')
+        except:
+            start = ''
+        if end == ';':
+            end = ''
+        return start + end
 
     def __call__(self, data, add_namespace=False):
         from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
         data = self.PAGE_PAT.sub('', data)
-        if '\n' in data:
-            data = self.MS_PAT.sub('', data)
+        data = self.MS_PAT.sub(self.ms_sub, data)
         if not add_namespace:
             return data
         ans, namespaced = [], False
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index ccc452f1f8..7e99916fc3 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -827,6 +827,24 @@ class Manifest(object):
                 return None
             return etree.fromstring(data, parser=RECOVER_PARSER)
 
+        def clean_word_doc(self, data):
+            prefixes = []
+            for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
+                prefixes.append(match.group(1))
+            if prefixes:
+                self.oeb.log.warn('Found microsoft markup, cleaning...')
+                # Remove empty tags as they are not rendered by browsers
+                # but can become renderable HTML tags like <p/> if the
+                # document is parsed by an HTML parser
+                pat = re.compile(
+                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)),
+                        re.DOTALL)
+                data = pat.sub('', data)
+                pat = re.compile(
+                        r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes)))
+                data = pat.sub('', data)
+            return data
+
         def _parse_xhtml(self, data):
             self.oeb.log.debug('Parsing', self.href, '...')
             # Convert to Unicode and normalize line endings
@@ -884,6 +902,10 @@ class Manifest(object):
                 except etree.XMLSyntaxError:
                     data = etree.fromstring(data, parser=RECOVER_PARSER)
                 return data
+            try:
+                data = self.clean_word_doc(data)
+            except:
+                pass
             data = first_pass(data)
 
             # Handle weird (non-HTML/fragment) files
@@ -907,6 +929,7 @@ class Manifest(object):
                         parent.append(child)
                 data = nroot
 
+
             # Force into the XHTML namespace
             if not namespace(data.tag):
                 self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py
index 849d161228..efc8fe1463 100644
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@@ -423,6 +423,7 @@ class Stylizer(object):
 
 class Style(object):
     UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
+    MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
 
     def __init__(self, element, stylizer):
         self._element = element
@@ -447,6 +448,8 @@ class Style(object):
             return
         css = attrib['style'].split(';')
         css = filter(None, (x.strip() for x in css))
+        css = [x.strip() for x in css]
+        css = [x for x in css if self.MS_PAT.match(x) is None]
         try:
             style = CSSStyleDeclaration('; '.join(css))
         except CSSSyntaxError: