Conversion pipeline: Fix regression in 0.7.46 that caused loss of some CSS information when converting HTML produced by Microsoft Word. Also remove empty tags from microsoft namespaces when parsing HTML

This commit is contained in:
Kovid Goyal 2011-03-01 11:43:00 -07:00
parent 16ab1b8dbb
commit 27412b5b5c
3 changed files with 44 additions and 6 deletions

View File

@ -265,16 +265,28 @@ class CSSPreProcessor(object):
# Matches a complete @page rule: the "@page" keyword, any text up to the
# opening brace, and a body running to the first closing brace.
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
# Remove some of the broken CSS Microsoft products
# create, slightly dangerous as it removes to end of line
# rather than semi-colon
# NOTE(review): MS_PAT is assigned again a few lines below in this listing,
# so this line-oriented pattern is not the one left bound to the name.
MS_PAT = re.compile(r'^\s*(mso-|panose-).+?$',
re.MULTILINE|re.IGNORECASE)
# Remove some of the broken CSS Microsoft products create.
# This pattern is scoped to a single declaration: it begins at a line start,
# a ';' or a '{', requires a property name starting with one of the listed
# prefixes (mso-, panose-, text-underline, tab-interval), and stops at the
# first line end, ';' or '}'. The delimiters are captured in the named
# groups 'start' and 'end'.
MS_PAT = re.compile(r'''
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
(%s).+? # The invalid selectors
(?P<end>$|;|\}) # The end of the declaration
'''%'mso-|panose-|text-underline|tab-interval',
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
def ms_sub(self, match):
    """Return the replacement text for one matched Microsoft declaration.

    The declaration itself is dropped; only the delimiters captured around
    it are kept.

    :param match: A regex match object that defines an ``end`` group and,
        optionally, a ``start`` group.
    :return: The ``start`` delimiter followed by the ``end`` delimiter,
        except that an ``end`` of ``';'`` is discarded.
    """
    end = match.group('end')
    try:
        start = match.group('start')
    except IndexError:
        # match.group() raises IndexError when the pattern defines no group
        # with this name; fall back to an empty leading delimiter.
        start = ''
    if end == ';':
        # The terminating semicolon goes away with the declaration it
        # closed; whatever was captured in ``start`` is still emitted.
        end = ''
    return start + end
def __call__(self, data, add_namespace=False):
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
data = self.PAGE_PAT.sub('', data)
if '\n' in data:
data = self.MS_PAT.sub('', data)
data = self.MS_PAT.sub(self.ms_sub, data)
if not add_namespace:
return data
ans, namespaced = [], False

View File

@ -827,6 +827,24 @@ class Manifest(object):
return None
return etree.fromstring(data, parser=RECOVER_PARSER)
def clean_word_doc(self, data):
    """Strip empty elements that belong to Microsoft XML namespaces.

    Namespace prefixes are collected from ``xmlns:<prefix>="..."``
    declarations whose double-quoted value contains ``microsoft``. Empty
    elements using those prefixes (``<o:p></o:p>`` and ``<o:p/>`` forms)
    are then removed from the markup.

    :param data: The document markup as a string.
    :return: ``data`` with the empty prefixed elements removed, or ``data``
        unchanged when no such namespace declaration is found.
    """
    prefixes = []
    # The value is matched with [^"]* so the search cannot run past the
    # closing quote of one declaration into the value of the next one.
    for match in re.finditer(r'xmlns:([^\s=]+)="[^"]*microsoft[^"]*"', data):
        prefix = match.group(1)
        if prefix not in prefixes:
            prefixes.append(prefix)
    if not prefixes:
        return data

    self.oeb.log.warn('Found microsoft markup, cleaning...')
    # Escape the prefixes: they are literal names, and characters such as
    # '.' would otherwise act as regex metacharacters.
    alternatives = '|'.join(re.escape(prefix) for prefix in prefixes)
    # Remove empty tags as they are not rendered by browsers
    # but can become renderable HTML tags like <p/> if the
    # document is parsed by an HTML parser
    paired = re.compile(
        r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>' % alternatives)
    data = paired.sub('', data)
    self_closed = re.compile(
        r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>' % alternatives)
    return self_closed.sub('', data)
def _parse_xhtml(self, data):
self.oeb.log.debug('Parsing', self.href, '...')
# Convert to Unicode and normalize line endings
@ -884,6 +902,10 @@ class Manifest(object):
except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER)
return data
try:
data = self.clean_word_doc(data)
except:
pass
data = first_pass(data)
# Handle weird (non-HTML/fragment) files
@ -907,6 +929,7 @@ class Manifest(object):
parent.append(child)
data = nroot
# Force into the XHTML namespace
if not namespace(data.tag):
self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace')

View File

@ -423,6 +423,7 @@ class Stylizer(object):
class Style(object):
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc)$')
MS_PAT = re.compile(r'^\s*(mso-|panose-|text-underline|tab-interval)')
def __init__(self, element, stylizer):
self._element = element
@ -447,6 +448,8 @@ class Style(object):
return
css = attrib['style'].split(';')
css = filter(None, (x.strip() for x in css))
css = [x.strip() for x in css]
css = [x for x in css if self.MS_PAT.match(x) is None]
try:
style = CSSStyleDeclaration('; '.join(css))
except CSSSyntaxError: