Edit book: Add a check for too large HTML files when running the check book tool. Fixes #1264680 [[Feature Request]: Edit Book 300/100 kb warning on HTML](https://bugs.launchpad.net/calibre/+bug/1264680)

This commit is contained in:
Kovid Goyal 2014-01-01 11:29:03 +05:30
parent 3bfd3bc07f
commit c497dc1097
2 changed files with 20 additions and 3 deletions

View File

@ -12,7 +12,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.polish.cover import is_raster_image
from calibre.ebooks.oeb.polish.check.base import run_checkers
from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing, check_css_parsing, fix_style_tag
from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size
from calibre.ebooks.oeb.polish.check.images import check_raster_images
from calibre.ebooks.oeb.polish.check.links import check_links
from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -37,6 +37,7 @@ def run_checks(container):
items = raster_images
if items is not None:
items.append((name, mt, container.open(name, 'rb').read()))
errors.extend(run_checkers(check_html_size, html_items))
errors.extend(run_checkers(check_xml_parsing, xml_items))
errors.extend(run_checkers(check_xml_parsing, html_items))
errors.extend(run_checkers(check_raster_images, raster_images))

View File

@ -11,11 +11,11 @@ import re
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
import cssutils
from calibre import force_unicode
from calibre import force_unicode, human_readable
from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
from calibre.ebooks.oeb.polish.utils import PositionFinder
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR
from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO
from calibre.ebooks.oeb.base import OEB_DOCS
HTML_ENTITTIES = frozenset(html5_entities)
@ -61,6 +61,16 @@ class NamedEntities(BaseError):
f.write(nraw.encode('utf-8'))
return True
class TooLarge(BaseError):
    """Informational check-book notice for an HTML file above MAX_SIZE.

    The notice carries a fixed, translated title and a help text that
    quotes the threshold in human readable form.
    """

    # Size threshold; check_html_size() compares len(raw) against this.
    MAX_SIZE = 260 * 1024

    # Reported as information only, not as a warning or an error.
    level = INFO

    # Built once, when the class body executes, so the threshold shown to
    # the user always matches MAX_SIZE.
    HELP = _(
        'This HTML file is larger than %s. Too large HTML files can cause performance problems'
        ' on some ebook readers. Consider splitting this file into smaller sections.'
    ) % human_readable(MAX_SIZE)

    def __init__(self, name):
        # name identifies the offending file inside the book container.
        title = _('File too large')
        BaseError.__init__(self, title, name)
class BadEntity(BaseError):
HELP = _('This is an invalid (unrecognized) entity. Replace it with whatever'
@ -103,6 +113,12 @@ class EntitityProcessor(object):
self.bad_entities.append((m.start(), m.group()))
return b' ' * len(m.group())
def check_html_size(name, mt, raw):
    """Report an HTML file whose raw content exceeds TooLarge.MAX_SIZE.

    Returns a list containing a single TooLarge notice for ``name`` when
    ``len(raw)`` is greater than the threshold, and an empty list
    otherwise. ``mt`` is accepted but not used by this check.
    """
    # Guard clause: files at or below the threshold produce no notice.
    if len(raw) <= TooLarge.MAX_SIZE:
        return []
    return [TooLarge(name)]
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
def check_xml_parsing(name, mt, raw):