mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Conversion pipeline: Fix regression in 0.7.46 that caused loss of some CSS information when converting HTML produced by Microsoft Word. Also remove empty tags from Microsoft namespaces when parsing HTML.
This commit is contained in:
parent
16ab1b8dbb
commit
27412b5b5c
@ -265,16 +265,28 @@ class CSSPreProcessor(object):
|
|||||||
|
|
||||||
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
|
||||||
# Remove some of the broken CSS Microsoft products
|
# Remove some of the broken CSS Microsoft products
|
||||||
# create, slightly dangerous as it removes to end of line
|
# create
|
||||||
# rather than semi-colon
|
MS_PAT = re.compile(r'''
|
||||||
MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$',
|
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
|
||||||
re.MULTILINE|re.IGNORECASE)
|
(%s).+? # The invalid selectors
|
||||||
|
(?P<end>$|;|\}) # The end of the declaration
|
||||||
|
'''%'mso-|panose-|text-underline|tab-interval',
|
||||||
|
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
|
||||||
|
|
||||||
|
def ms_sub(self, match):
    """Return the replacement text for one ``MS_PAT`` match.

    Used as the callback for ``MS_PAT.sub``. The matched declaration is
    dropped; only the delimiters around it are kept so the surrounding
    CSS stays well formed.

    :param match: a regex match object exposing an ``end`` group and,
        optionally, a ``start`` group.
    :return: the ``start`` delimiter followed by the ``end`` delimiter,
        except that a trailing ``;`` is discarded because it belonged to
        the removed declaration.
    """
    try:
        start = match.group('start')
    except IndexError:
        # The pattern defines no ``start`` group at all.
        start = None
    end = match.group('end')
    if end == ';':
        # The semi-colon terminated the declaration being removed.
        end = ''
    # A group that did not participate in the match is None; treat it as
    # an empty delimiter instead of failing on str + None.
    return (start or '') + (end or '')
|
||||||
|
|
||||||
def __call__(self, data, add_namespace=False):
|
def __call__(self, data, add_namespace=False):
|
||||||
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
|
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
|
||||||
data = self.PAGE_PAT.sub('', data)
|
data = self.PAGE_PAT.sub('', data)
|
||||||
if '\n' in data:
|
data = self.MS_PAT.sub(self.ms_sub, data)
|
||||||
data = self.MS_PAT.sub('', data)
|
|
||||||
if not add_namespace:
|
if not add_namespace:
|
||||||
return data
|
return data
|
||||||
ans, namespaced = [], False
|
ans, namespaced = [], False
|
||||||
|
@ -827,6 +827,24 @@ class Manifest(object):
|
|||||||
return None
|
return None
|
||||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
return etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
|
|
||||||
|
def clean_word_doc(self, data):
    """Strip empty elements that belong to Microsoft XML namespaces.

    Scans ``data`` for ``xmlns:<prefix>="..."`` declarations whose value
    contains ``microsoft`` and, if any are found, logs a warning and
    removes empty elements (``<p:tag ...></p:tag>`` and ``<p:tag .../>``)
    that use one of those prefixes.

    :param data: the markup to clean, as a string.
    :return: the cleaned markup; ``data`` unchanged when no Microsoft
        namespace declaration is present.
    """
    # ``[^"]*`` keeps the match inside a single quoted attribute value.
    # With ``.*?`` the search could run past the closing quote and blame
    # an unrelated prefix declared earlier on the same line.
    prefixes = re.findall(
            r'xmlns:([^\s="]+)="[^"]*?microsoft[^"]*?"', data)
    if prefixes:
        self.oeb.log.warn('Found microsoft markup, cleaning...')
        # Escape the prefixes: XML prefixes may contain characters such
        # as ``.`` or ``-`` that are special in regular expressions.
        alternation = '|'.join(re.escape(prefix) for prefix in prefixes)
        # Remove empty tags as they are not rendered by browsers
        # but can become renderable HTML tags like <p/> if the
        # document is parsed by an HTML parser
        pat = re.compile(
                r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>' % alternation,
                re.DOTALL)
        data = pat.sub('', data)
        pat = re.compile(
                r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>' % alternation)
        data = pat.sub('', data)
    return data
|
||||||
|
|
||||||
def _parse_xhtml(self, data):
|
def _parse_xhtml(self, data):
|
||||||
self.oeb.log.debug('Parsing', self.href, '...')
|
self.oeb.log.debug('Parsing', self.href, '...')
|
||||||
# Convert to Unicode and normalize line endings
|
# Convert to Unicode and normalize line endings
|
||||||
@ -884,6 +902,10 @@ class Manifest(object):
|
|||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
return data
|
return data
|
||||||
|
try:
|
||||||
|
data = self.clean_word_doc(data)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
data = first_pass(data)
|
data = first_pass(data)
|
||||||
|
|
||||||
# Handle weird (non-HTML/fragment) files
|
# Handle weird (non-HTML/fragment) files
|
||||||
@ -907,6 +929,7 @@ class Manifest(object):
|
|||||||
parent.append(child)
|
parent.append(child)
|
||||||
data = nroot
|
data = nroot
|
||||||
|
|
||||||
|
|
||||||
# Force into the XHTML namespace
|
# Force into the XHTML namespace
|
||||||
if not namespace(data.tag):
|
if not namespace(data.tag):
|
||||||
self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
|
self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')
|
||||||
|
@ -423,6 +423,7 @@ class Stylizer(object):
|
|||||||
|
|
||||||
class Style(object):
|
class Style(object):
|
||||||
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
|
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
|
||||||
|
MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
|
||||||
|
|
||||||
def __init__(self, element, stylizer):
|
def __init__(self, element, stylizer):
|
||||||
self._element = element
|
self._element = element
|
||||||
@ -447,6 +448,8 @@ class Style(object):
|
|||||||
return
|
return
|
||||||
css = attrib['style'].split(';')
|
css = attrib['style'].split(';')
|
||||||
css = filter(None, (x.strip() for x in css))
|
css = filter(None, (x.strip() for x in css))
|
||||||
|
css = [x.strip() for x in css]
|
||||||
|
css = [x for x in css if self.MS_PAT.match(x) is None]
|
||||||
try:
|
try:
|
||||||
style = CSSStyleDeclaration('; '.join(css))
|
style = CSSStyleDeclaration('; '.join(css))
|
||||||
except CSSSyntaxError:
|
except CSSSyntaxError:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user