diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py
index ad2c33d55d..16c1a7dafb 100644
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@@ -8,11 +8,11 @@ __copyright__ = '2013, Kovid Goyal '
 
 from future_builtins import map
 
-from calibre.ebooks.oeb.base import OEB_DOCS
+from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
 from calibre.ebooks.oeb.polish.container import guess_type
 from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
-from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing
+from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing, check_css_parsing, fix_style_tag
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
 from calibre.ebooks.oeb.polish.check.links import check_links
 
@@ -23,13 +23,15 @@ def run_checks(container):
     errors = []
 
     # Check parsing
-    xml_items, html_items, raster_images = [], [], []
+    xml_items, html_items, raster_images, stylesheets = [], [], [], []
     for name, mt in container.mime_map.iteritems():
         items = None
         if mt in XML_TYPES:
             items = xml_items
         elif mt in OEB_DOCS:
             items = html_items
+        elif mt in OEB_STYLES:
+            items = stylesheets
         elif is_raster_image(mt):
             items = raster_images
         if items is not None:
@@ -38,6 +40,15 @@ def run_checks(container):
     errors.extend(run_checkers(check_xml_parsing, html_items))
     errors.extend(run_checkers(check_raster_images, raster_images))
 
+    # cssutils is not thread safe
+    for name, mt, raw in stylesheets:
+        errors.extend(check_css_parsing(name, raw))
+    for name, mt, raw in html_items:
+        root = container.parsed(name)
+        for style in root.xpath('//*[local-name()="style"]'):
+            if style.get('type', 'text/css') == 'text/css' and style.text:
+                errors.extend(check_css_parsing(name, style.text, line_offset=style.sourceline - 1))
+
     errors += check_links(container)
 
     return errors
@@ -46,8 +57,13 @@ def fix_errors(container, errors):
     # Fix parsing
     changed = False
     for name in {e.name for e in errors if getattr(e, 'is_parsing_error', False)}:
-        container.parsed(name)
+        root = container.parsed(name)
         container.dirty(name)
+        if container.mime_map[name] in OEB_DOCS:
+            for style in root.xpath('//*[local-name()="style"]'):
+                if style.get('type', 'text/css') == 'text/css' and style.text and style.text.strip():
+                    fix_style_tag(container, style)
+
         changed = True
 
     for err in errors:
diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py
index bd351b5e86..c7b627119a 100644
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@@ -9,10 +9,12 @@ __copyright__ = '2013, Kovid Goyal '
 import re
 
 from lxml.etree import XMLParser, fromstring, XMLSyntaxError
+import cssutils
 
+from calibre import force_unicode
 from calibre.ebooks.html_entities import html5_entities
 from calibre.ebooks.oeb.polish.utils import PositionFinder
-from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
+from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR
 from calibre.ebooks.oeb.base import OEB_DOCS
 
 HTML_ENTITTIES = frozenset(html5_entities)
@@ -21,6 +23,19 @@ ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
 
 replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
 
+def fix_style_tag(container, style):
+    ' Re-serialize the CSS inside a <style> tag, re-using the whitespace that precedes the tag as indentation for its closing tag '
+    prev = style.getprevious()
+    ws = style.getparent().text if prev is None else prev.tail
+    if not ws:
+        # No text before the tag (None or empty): fall back to no indentation instead of crashing
+        ws = '\n'
+    ws = ws.splitlines()[-1]
+    indent = ws[len(ws.rstrip()):]
+
+    sheet = container.parse_css(style.text)
+    style.text = '\n' + force_unicode(sheet.cssText, 'utf-8') + '\n' + indent
+
 class XMLParseError(BaseError):
 
     is_parsing_error = True
@@ -131,3 +146,77 @@ def check_xml_parsing(name, mt, raw):
 
     return errors
 
+class CSSError(BaseError):
+
+    is_parsing_error = True
+
+    def __init__(self, level, msg, name, line, col):
+        self.level = level
+        prefix = 'CSS: '
+        BaseError.__init__(self, prefix + msg, name, line, col)
+        if level == WARN:
+            self.HELP = _('This CSS construct is not recognized. That means that it'
+                          ' most likely will not work on reader devices. Consider'
+                          ' replacing it with something else.')
+        else:
+            self.HELP = _('Some reader programs are very'
+                          ' finicky about CSS stylesheets and will ignore the whole'
+                          ' sheet if there is an error. These errors can often'
+                          ' be fixed automatically, however, automatic fixing will'
+                          ' typically remove unrecognized items, instead of correcting them.')
+            self.INDIVIDUAL_FIX = _('Try to fix parsing errors in this stylesheet automatically')
+
+    def __call__(self, container):
+        ' Fix this error: parse the file, mark it dirty and, for HTML files, rewrite every non-empty CSS <style> tag. Always returns True. '
+        root = container.parsed(self.name)
+        container.dirty(self.name)
+        if container.mime_map[self.name] in OEB_DOCS:
+            for style in root.xpath('//*[local-name()="style"]'):
+                if style.get('type', 'text/css') == 'text/css' and style.text and style.text.strip():
+                    fix_style_tag(container, style)
+        return True
+
+# Regexps used to pull a (line, column) pair out of the text of a cssutils log message
+pos_pats = (re.compile(r'\[(\d+):(\d+)'), re.compile(r'(\d+), (\d+)\)'))
+
+class ErrorHandler(object):
+
+    ' Replacement logger to get useful error/warning info out of cssutils during parsing '
+
+    def __init__(self, name):
+        # name is stored on every CSSError created; errors collects them while parsing
+        self.name = name
+        self.errors = []
+
+    def __noop(self, *args, **kwargs):
+        pass
+    info = debug = setLevel = getEffectiveLevel = addHandler = removeHandler = __noop
+
+    def __handle(self, level, *args):
+        msg = ' '.join(map(unicode, args))
+        line = col = None
+        # Every pattern is tried; if more than one matches, the last match wins
+        for pat in pos_pats:
+            m = pat.search(msg)
+            if m is not None:
+                line, col = int(m.group(1)), int(m.group(2))
+        if msg and line is not None:
+            # Ignore error messages with no line numbers as these are usually
+            # summary messages for an underlying error with a line number
+            self.errors.append(CSSError(level, msg, self.name, line, col))
+
+    def error(self, *args):
+        self.__handle(ERROR, *args)
+
+    def warn(self, *args):
+        self.__handle(WARN, *args)
+    warning = warn
+
+def check_css_parsing(name, raw, line_offset=0):
+    ' Parse raw as CSS with validation and return the list of CSSError objects logged, with line_offset added to each line number '
+    log = ErrorHandler(name)
+    parser = cssutils.CSSParser(fetcher=lambda x: (None, None), log=log)
+    parser.parseString(raw, validate=True)
+    for err in log.errors:
+        err.line += line_offset
+    return log.errors
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 63dd3e58ce..8105f318fa 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -434,7 +434,8 @@ class Container(object): # {{{
         from cssutils import CSSParser, log
         log.setLevel(logging.WARN)
         log.raiseExceptions = False
-        data = self.decode(data)
+        if isinstance(data, bytes):
+            data = self.decode(data)
         if not self.tweak_mode:
             data = self.css_preprocessor(data)
         parser = CSSParser(loglevel=logging.WARNING,