mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Conversion pipeline: Discard broken CSS generated by Microsoft products before parsing. This should improve performance, in particular when converting HTML from Word.
This commit is contained in:
parent
b43c7d45a3
commit
7ec5194d9b
@ -264,10 +264,16 @@ class Dehyphenator(object):
|
|||||||
class CSSPreProcessor(object):
|
class CSSPreProcessor(object):
|
||||||
|
|
||||||
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
||||||
|
# Remove some of the broken CSS Microsoft products
|
||||||
|
# create
|
||||||
|
MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$',
|
||||||
|
re.MULTILINE|re.IGNORECASE)
|
||||||
|
|
||||||
def __call__(self, data, add_namespace=False):
|
def __call__(self, data, add_namespace=False):
|
||||||
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
|
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
|
||||||
data = self.PAGE_PAT.sub('', data)
|
data = self.PAGE_PAT.sub('', data)
|
||||||
|
if '\n' in data:
|
||||||
|
data = self.MS_PAT.sub('', data)
|
||||||
if not add_namespace:
|
if not add_namespace:
|
||||||
return data
|
return data
|
||||||
ans, namespaced = [], False
|
ans, namespaced = [], False
|
||||||
|
Loading…
x
Reference in New Issue
Block a user