Conversion pipeline: Discard broken CSS generated by Microsoft products before parsing. Should improve performance when converting HTML from Word in particular.

This commit is contained in:
Kovid Goyal 2011-02-17 15:47:14 -07:00
parent b43c7d45a3
commit 7ec5194d9b

View File

@@ -264,10 +264,16 @@ class Dehyphenator(object):
class CSSPreProcessor(object): class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
# Remove some of the broken CSS Microsoft products
# create
MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$',
re.MULTILINE|re.IGNORECASE)
def __call__(self, data, add_namespace=False): def __call__(self, data, add_namespace=False):
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
data = self.PAGE_PAT.sub('', data) data = self.PAGE_PAT.sub('', data)
if '\n' in data:
data = self.MS_PAT.sub('', data)
if not add_namespace: if not add_namespace:
return data return data
ans, namespaced = [], False ans, namespaced = [], False