Check Book: Add checks for empty HTML/XML/CSS/image files in the book. Fixes #1329971 [Private bug](https://bugs.launchpad.net/calibre/+bug/1329971)

This commit is contained in:
Kovid Goyal 2014-06-14 10:26:58 +05:30
parent 5ee7a620ea
commit 82f9182f95
3 changed files with 25 additions and 1 deletions

View File

@ -9,6 +9,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre import as_unicode
from calibre.utils.magick import Image
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
from calibre.ebooks.oeb.polish.check.parsing import EmptyFile
class InvalidImage(BaseError):
@ -47,6 +48,8 @@ class CMYKImage(BaseError):
return True
def check_raster_images(name, mt, raw):
if not raw:
return [EmptyFile(name)]
errors = []
i = Image()
try:

View File

@ -13,7 +13,8 @@ from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ebooks.oeb.polish.cover import is_raster_image
from calibre.ebooks.oeb.polish.check.base import run_checkers
from calibre.ebooks.oeb.polish.check.parsing import (
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size, check_ids)
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
check_html_size, check_ids, EmptyFile)
from calibre.ebooks.oeb.polish.check.images import check_raster_images
from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -46,8 +47,14 @@ def run_checks(container):
# cssutils is not thread safe
for name, mt, raw in stylesheets:
if not raw:
errors.append(EmptyFile(name))
continue
errors.extend(check_css_parsing(name, raw))
for name, mt, raw in html_items:
if not raw:
continue
root = container.parsed(name)
for style in root.xpath('//*[local-name()="style"]'):
if style.get('type', 'text/css') == 'text/css' and style.text:

View File

@ -25,6 +25,18 @@ ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
# Matches '&name;' for every name that is in HTML_ENTITTIES but not in XML_ENTITIES;
# the name itself is captured as group 1.
replace_pat = re.compile('&(%s);' % '|'.join(map(re.escape, sorted(HTML_ENTITTIES - XML_ENTITIES))))
# Matches a "tag mismatch:" message that mentions two "line N" positions and
# captures the first line number as group 1.
# Raw string: '\d' in a plain literal is an invalid escape sequence that newer
# Python versions warn about; the compiled pattern is unchanged.
mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+')
class EmptyFile(BaseError):
    """Check Book error reported for a file that contains no data.

    The individual fix offered for this error removes the file from the
    container.
    """

    HELP = _('This file is empty, it contains nothing, you should probably remove it.')
    INDIVIDUAL_FIX = _('Remove this file')

    def __init__(self, name):
        """Create the error for the book file called ``name``."""
        message = _('The file %s is empty') % name
        BaseError.__init__(self, message, name)

    def __call__(self, container):
        """Apply the fix by removing this file from ``container``.

        Always returns True (presumably to signal that the container was
        changed -- confirm against BaseError's contract).
        """
        # self.name is expected to be set by BaseError.__init__, which is not
        # visible here.
        container.remove_item(self.name)
        return True
class DecodeError(BaseError):
is_parsing_error = True
@ -197,6 +209,8 @@ def check_html_size(name, mt, raw):
# Bytes pattern for an entity reference: '&', an optional '#', 1 to 8 ASCII
# alphanumerics, then ';'. The text between '&' and ';' is captured as group 1.
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
def check_xml_parsing(name, mt, raw):
if not raw:
return [EmptyFile(name)]
raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
# Get rid of entities as named entities trip up the XML parser
eproc = EntitityProcessor(mt)