From c497dc1097e708c724ce9ce7223dc37b13d6f94c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 1 Jan 2014 11:29:03 +0530 Subject: [PATCH] Edit book: Add a check for too large HTML files when running the check book tool. Fixes #1264680 [[Feature Request]: Edit Book 300/100 kb warning on HTML](https://bugs.launchpad.net/calibre/+bug/1264680) --- src/calibre/ebooks/oeb/polish/check/main.py | 3 ++- .../ebooks/oeb/polish/check/parsing.py | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py index 146d847171..9e61a7ea5e 100644 --- a/src/calibre/ebooks/oeb/polish/check/main.py +++ b/src/calibre/ebooks/oeb/polish/check/main.py @@ -12,7 +12,7 @@ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES from calibre.ebooks.oeb.polish.container import guess_type from calibre.ebooks.oeb.polish.cover import is_raster_image from calibre.ebooks.oeb.polish.check.base import run_checkers -from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing, check_css_parsing, fix_style_tag +from calibre.ebooks.oeb.polish.check.parsing import check_xml_parsing, check_css_parsing, fix_style_tag, check_html_size from calibre.ebooks.oeb.polish.check.images import check_raster_images from calibre.ebooks.oeb.polish.check.links import check_links from calibre.ebooks.oeb.polish.check.fonts import check_fonts @@ -37,6 +37,7 @@ def run_checks(container): items = raster_images if items is not None: items.append((name, mt, container.open(name, 'rb').read())) + errors.extend(run_checkers(check_html_size, html_items)) errors.extend(run_checkers(check_xml_parsing, xml_items)) errors.extend(run_checkers(check_xml_parsing, html_items)) errors.extend(run_checkers(check_raster_images, raster_images)) diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py index 4e3a2268db..c9ae875e0f 100644 --- a/src/calibre/ebooks/oeb/polish/check/parsing.py +++ b/src/calibre/ebooks/oeb/polish/check/parsing.py @@ -11,11 +11,11 @@ import re from lxml.etree import XMLParser, fromstring, XMLSyntaxError import cssutils -from calibre import force_unicode +from calibre import force_unicode, human_readable from calibre.ebooks.html_entities import html5_entities from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag from calibre.ebooks.oeb.polish.utils import PositionFinder -from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR +from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO from calibre.ebooks.oeb.base import OEB_DOCS HTML_ENTITTIES = frozenset(html5_entities) @@ -61,6 +61,16 @@ class NamedEntities(BaseError): f.write(nraw.encode('utf-8')) return True +class TooLarge(BaseError): + + level = INFO + MAX_SIZE = 260 *1024 + HELP = _('This HTML file is larger than %s. Too large HTML files can cause performance problems' + ' on some ebook readers. Consider splitting this file into smaller sections.') % human_readable(MAX_SIZE) + + def __init__(self, name): + BaseError.__init__(self, _('File too large'), name) + class BadEntity(BaseError): HELP = _('This is an invalid (unrecognized) entity. Replace it with whatever' @@ -103,6 +113,12 @@ class EntitityProcessor(object): self.bad_entities.append((m.start(), m.group())) return b' ' * len(m.group()) +def check_html_size(name, mt, raw): + errors = [] + if len(raw) > TooLarge.MAX_SIZE: + errors.append(TooLarge(name)) + return errors + entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});') def check_xml_parsing(name, mt, raw):