Error checking for CSS (stylesheets and inside <style> tags)

This commit is contained in:
Kovid Goyal 2013-12-10 22:20:57 +05:30
parent 6897a97342
commit ed50bc7ed5
3 changed files with 104 additions and 6 deletions

View File

@ -8,11 +8,11 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from future_builtins import map
from calibre.ebooks.oeb.base import OEB_DOCS
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.polish.cover import is_raster_image
from calibre.ebooks.oeb.polish.check.base import run_checkers
from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing
from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing, check_css_parsing, fix_style_tag
from calibre.ebooks.oeb.polish.check.images import check_raster_images
from calibre.ebooks.oeb.polish.check.links import check_links
@ -23,13 +23,15 @@ def run_checks(container):
errors = []
# Check parsing
xml_items, html_items, raster_images = [], [], []
xml_items, html_items, raster_images, stylesheets = [], [], [], []
for name, mt in container.mime_map.iteritems():
items = None
if mt in XML_TYPES:
items = xml_items
elif mt in OEB_DOCS:
items = html_items
elif mt in OEB_STYLES:
items = stylesheets
elif is_raster_image(mt):
items = raster_images
if items is not None:
@ -38,6 +40,15 @@ def run_checks(container):
errors.extend(run_checkers(check_xml_parsing, html_items))
errors.extend(run_checkers(check_raster_images, raster_images))
# cssutils is not thread safe
for name, mt, raw in stylesheets:
errors.extend(check_css_parsing(name, raw))
for name, mt, raw in html_items:
root = container.parsed(name)
for style in root.xpath('//*[local-name()="style"]'):
if style.get('type', 'text/css') == 'text/css':
errors.extend(check_css_parsing(name, style.text, line_offset=style.sourceline - 1))
errors += check_links(container)
return errors
@ -46,8 +57,13 @@ def fix_errors(container, errors):
# Fix parsing
changed = False
for name in {e.name for e in errors if getattr(e, 'is_parsing_error', False)}:
container.parsed(name)
root = container.parsed(name)
container.dirty(name)
if container.mime_map[name] in OEB_DOCS:
for style in root.xpath('//*[local-name()="style"]'):
if style.get('type', 'text/css') == 'text/css' and style.text and style.text.strip():
fix_style_tag(container, style)
changed = True
for err in errors:

View File

@ -9,10 +9,12 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
import cssutils
from calibre import force_unicode
from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.utils import PositionFinder
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR
from calibre.ebooks.oeb.base import OEB_DOCS
HTML_ENTITTIES = frozenset(html5_entities)
@ -21,6 +23,15 @@ ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
def fix_style_tag(container, style):
    """Replace the CSS text of a <style> element with its re-serialized form.

    The element's text is parsed with ``container.parse_css`` and replaced by
    the serialized sheet, surrounded by newlines. The closing tag is indented
    with the trailing whitespace of the last line of text that precedes the
    opening tag.

    :param container: object providing ``parse_css(text)``
    :param style: the <style> element (uses ``getprevious``, ``getparent``,
        ``text`` and ``tail``); its ``text`` is modified in place
    """
    prev = style.getprevious()
    # The text in front of the tag is the previous sibling's tail or, for a
    # first child, the parent's text. Either can be None when the tag directly
    # follows another tag, so missing text is treated as "no indentation".
    if prev is None:
        parent = style.getparent()
        ws = None if parent is None else parent.text
    else:
        ws = prev.tail
    lines = (ws or '').splitlines()
    last_line = lines[-1] if lines else ''
    # Trailing whitespace of the last preceding line
    indent = last_line[len(last_line.rstrip()):]
    sheet = container.parse_css(style.text)
    style.text = '\n' + force_unicode(sheet.cssText, 'utf-8') + '\n' + indent
class XMLParseError(BaseError):
is_parsing_error = True
@ -131,3 +142,73 @@ def check_xml_parsing(name, mt, raw):
return errors
class CSSError(BaseError):
    """An error or warning reported while parsing CSS.

    Instances are callable: calling one with a container re-parses the
    affected file, marks it dirty and, for HTML documents, rewrites the
    contents of every non-empty CSS <style> tag via fix_style_tag().
    """

    is_parsing_error = True

    def __init__(self, level, msg, name, line, col):
        # level is stored before the base initializer runs
        self.level = level
        BaseError.__init__(self, 'CSS: ' + msg, name, line, col)
        if level != WARN:
            # Only non-warnings offer an individual automatic fix
            self.HELP = _('Some reader programs are very'
                          ' finicky about CSS stylesheets and will ignore the whole'
                          ' sheet if there is an error. These errors can often'
                          ' be fixed automatically, however, automatic fixing will'
                          ' typically remove unrecognized items, instead of correcting them.')
            self.INDIVIDUAL_FIX = _('Try to fix parsing errors in this stylesheet automatically')
        else:
            self.HELP = _('This CSS construct is not recognized. That means that it'
                          ' most likely will not work on reader devices. Consider'
                          ' replacing it with something else.')

    def __call__(self, container):
        root = container.parsed(self.name)
        container.dirty(self.name)
        if container.mime_map[self.name] not in OEB_DOCS:
            # Nothing further to do for files that are not HTML documents
            return True
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') != 'text/css':
                continue
            css = style.text
            if css and css.strip():
                fix_style_tag(container, style)
        return True
# Patterns used to pull a (line, column) pair out of a message. The first
# matches text of the form "[<line>:<col>", the second "<line>, <col>)".
pos_pats = (
    re.compile(r'\[(\d+):(\d+)'),
    re.compile(r'(\d+), (\d+)\)'),
)
class ErrorHandler(object):

    ' Logger stand-in handed to cssutils: collects errors/warnings that carry a position as CSSError objects '

    def __init__(self, name):
        self.errors = []
        self.name = name

    def __noop(self, *args, **kwargs):
        return None
    # The remaining logger-style methods are accepted and ignored
    removeHandler = addHandler = getEffectiveLevel = setLevel = debug = info = __noop

    def __handle(self, level, *args):
        msg = ' '.join(map(unicode, args))
        position = None
        for pat in pos_pats:
            found = pat.search(msg)
            if found is None:
                continue
            # No break on purpose: when several patterns match, the last wins
            position = int(found.group(1)), int(found.group(2))
        if not msg or position is None:
            # Messages without a position are dropped; they are usually
            # summaries of an underlying error that does have one
            return
        line, col = position
        self.errors.append(CSSError(level, msg, self.name, line, col))

    def error(self, *args):
        self.__handle(ERROR, *args)

    def warn(self, *args):
        self.__handle(WARN, *args)
    warning = warn
def check_css_parsing(name, raw, line_offset=0):
    """Parse ``raw`` as CSS with validation on and return the problems found.

    :param name: passed to the ErrorHandler, which records it on every error
    :param raw: the CSS text. ``None`` or an empty string (for example the
        text of an empty <style> tag) produces no errors.
    :param line_offset: added to the line number of every error, so that
        positions inside a <style> tag can be shifted to the tag's location
        in the enclosing file
    :return: the list of errors collected by the ErrorHandler
    """
    if not raw:
        # Nothing to parse. An empty <style/> element has text None, which
        # must not be handed to the parser.
        return []
    log = ErrorHandler(name)
    # The fetcher answers (None, None) for whatever it is asked to fetch,
    # presumably so that referenced sheets are never loaded while checking.
    parser = cssutils.CSSParser(fetcher=lambda x: (None, None), log=log)
    parser.parseString(raw, validate=True)
    for err in log.errors:
        err.line += line_offset
    return log.errors

View File

@ -434,6 +434,7 @@ class Container(object): # {{{
from cssutils import CSSParser, log
log.setLevel(logging.WARN)
log.raiseExceptions = False
if isinstance(data, bytes):
data = self.decode(data)
if not self.tweak_mode:
data = self.css_preprocessor(data)