mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: Fix regression in 0.7.46 that caused loss of some CSS information when converting HTML produced by Microsoft Word. Also remove empty tags from Microsoft namespaces when parsing HTML.
This commit is contained in:
parent
16ab1b8dbb
commit
27412b5b5c
@ -265,16 +265,28 @@ class CSSPreProcessor(object):
|
||||
|
||||
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
||||
# Remove some of the broken CSS Microsoft products
|
||||
# create, slightly dangerous as it removes to end of line
|
||||
# rather than semi-colon
|
||||
MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$',
|
||||
re.MULTILINE|re.IGNORECASE)
|
||||
# create
|
||||
MS_PAT = re.compile(r'''
|
||||
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
|
||||
(%s).+? # The invalid selectors
|
||||
(?P<end>$|;|\}) # The end of the declaration
|
||||
'''%'mso-|panose-|text-underline|tab-interval',
|
||||
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
|
||||
|
||||
def ms_sub(self, match):
    """Return the replacement text for one ``MS_PAT`` match.

    Used as the callback for ``MS_PAT.sub``: the matched declaration is
    dropped and only its delimiters are kept.

    :param match: A regex match object exposing an ``end`` named group
        and, optionally, a ``start`` named group.
    :return: The ``start`` delimiter followed by the ``end`` delimiter,
        except that a closing ``;`` is discarded.
    :raises IndexError: If the pattern defines no ``end`` group.
    """
    # A named group that did not participate in the match yields None;
    # treat that the same as an empty delimiter so concatenation is safe.
    end = match.group('end') or ''
    try:
        start = match.group('start') or ''
    except IndexError:
        # The pattern that produced this match has no ``start`` group.
        start = ''
    if end == ';':
        # The removed declaration takes its terminating semicolon with it.
        end = ''
    return start + end
|
||||
|
||||
def __call__(self, data, add_namespace=False):
|
||||
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
|
||||
data = self.PAGE_PAT.sub('', data)
|
||||
if '\n' in data:
|
||||
data = self.MS_PAT.sub('', data)
|
||||
data = self.MS_PAT.sub(self.ms_sub, data)
|
||||
if not add_namespace:
|
||||
return data
|
||||
ans, namespaced = [], False
|
||||
|
@ -827,6 +827,24 @@ class Manifest(object):
|
||||
return None
|
||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
|
||||
def clean_word_doc(self, data):
    """Remove empty elements that live in Microsoft XML namespaces.

    Namespace prefixes are collected from every ``xmlns:<prefix>="..."``
    declaration whose value contains ``microsoft``. Elements using one of
    those prefixes are then deleted when they are empty, i.e. written
    either as ``<p:tag ...></p:tag>`` or as ``<p:tag .../>``.

    :param data: The document markup as a string.
    :return: The markup with the empty Microsoft elements removed; the
        input is returned unchanged when no such namespace is declared.
    """
    prefixes = []
    # [^"]* keeps the match inside a single attribute value. With .*? the
    # match could run on into a later attribute, capturing the wrong
    # prefix and hiding the real Microsoft one from finditer.
    for match in re.finditer(r'xmlns:([^\s="]+)="[^"]*microsoft[^"]*"', data):
        prefix = match.group(1)
        if prefix not in prefixes:
            prefixes.append(prefix)
    if not prefixes:
        return data

    self.oeb.log.warn('Found microsoft markup, cleaning...')
    # Prefixes may legally contain regex metacharacters such as '.', so
    # they must be escaped before being used as an alternation.
    alternation = '|'.join(re.escape(prefix) for prefix in prefixes)
    # Remove empty tags as they are not rendered by browsers
    # but can become renderable HTML tags like <p/> if the
    # document is parsed by an HTML parser
    open_close = re.compile(
        r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>' % alternation, re.DOTALL)
    data = open_close.sub('', data)
    self_closing = re.compile(
        r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>' % alternation)
    return self_closing.sub('', data)
|
||||
|
||||
def _parse_xhtml(self, data):
|
||||
self.oeb.log.debug('Parsing', self.href, '...')
|
||||
# Convert to Unicode and normalize line endings
|
||||
@ -884,6 +902,10 @@ class Manifest(object):
|
||||
except etree.XMLSyntaxError:
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
try:
|
||||
data = self.clean_word_doc(data)
|
||||
except:
|
||||
pass
|
||||
data = first_pass(data)
|
||||
|
||||
# Handle weird (non-HTML/fragment) files
|
||||
@ -907,6 +929,7 @@ class Manifest(object):
|
||||
parent.append(child)
|
||||
data = nroot
|
||||
|
||||
|
||||
# Force into the XHTML namespace
|
||||
if not namespace(data.tag):
|
||||
self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
|
||||
|
@ -423,6 +423,7 @@ class Stylizer(object):
|
||||
|
||||
class Style(object):
|
||||
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
|
||||
MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
|
||||
|
||||
def __init__(self, element, stylizer):
|
||||
self._element = element
|
||||
@ -447,6 +448,8 @@ class Style(object):
|
||||
return
|
||||
css = attrib['style'].split(';')
|
||||
css = filter(None, (x.strip() for x in css))
|
||||
css = [x.strip() for x in css]
|
||||
css = [x for x in css if self.MS_PAT.match(x) is None]
|
||||
try:
|
||||
style = CSSStyleDeclaration('; '.join(css))
|
||||
except CSSSyntaxError:
|
||||
|
Loading…
x
Reference in New Issue
Block a user