From 7ec5194d9bc06896537f1777c09a397b9c6f13b3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 17 Feb 2011 15:47:14 -0700 Subject: [PATCH] Conversion pipeline: Discard broken CSS generated by Microsoft products before parsing. Should improve performance when converting HTML from Word in particular. --- src/calibre/ebooks/conversion/preprocess.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 0adcb88cfd..14bfceac40 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -264,10 +264,16 @@ class Dehyphenator(object): class CSSPreProcessor(object): PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') + # Remove some of the broken CSS Microsoft products + # create + MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$', + re.MULTILINE|re.IGNORECASE) def __call__(self, data, add_namespace=False): from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE data = self.PAGE_PAT.sub('', data) + if '\n' in data: + data = self.MS_PAT.sub('', data) if not add_namespace: return data ans, namespaced = [], False