Edit Book: Check Book: Add a check for HTML files with non UTF-8 encoding declarations

This commit is contained in:
Kovid Goyal 2014-06-22 17:37:30 +05:30
parent a09c890518
commit 474455a7fd
3 changed files with 37 additions and 1 deletions

View File

@ -44,6 +44,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
raw = prefix + suffix
return raw, changed[0]
def find_declared_encoding(raw, limit=50*1024):
    """Return the encoding name declared near the start of ``raw``, if any.

    Only the first ``limit`` characters of ``raw`` are searched. Each pattern
    in ``ENCODING_PATS`` is tried in order and group 1 of the first pattern
    that matches is returned. ``None`` is returned when no pattern matches.
    """
    head = raw[:limit]
    # Lazy generator: patterns after the first successful one are never run,
    # exactly as with an explicit loop that returns on the first match.
    attempts = (pattern.search(head) for pattern in ENCODING_PATS)
    first_hit = next((hit for hit in attempts if hit is not None), None)
    return None if first_hit is None else first_hit.group(1)
def substitute_entites(raw):
    """Return ``raw`` with every ``ENTITY_PATTERN`` match substituted.

    Each match is passed to ``calibre.xml_entity_to_unicode`` and replaced by
    whatever that callback returns.
    """
    # Imported locally, as in the original, rather than at module level.
    from calibre import xml_entity_to_unicode as entity_to_unicode
    converted = ENTITY_PATTERN.sub(entity_to_unicode, raw)
    return converted

View File

@ -14,7 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
from calibre.ebooks.oeb.polish.check.base import run_checkers
from calibre.ebooks.oeb.polish.check.parsing import (
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
check_html_size, check_ids, EmptyFile)
check_html_size, check_ids, EmptyFile, check_encoding_declarations)
from calibre.ebooks.oeb.polish.check.images import check_raster_images
from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -52,6 +52,9 @@ def run_checks(container):
continue
errors.extend(check_css_parsing(name, raw))
for name, mt, raw in html_items + xml_items:
errors.extend(check_encoding_declarations(name, container))
for name, mt, raw in html_items:
if not raw:
continue

View File

@ -12,6 +12,7 @@ from lxml.etree import XMLParser, fromstring, XMLSyntaxError
import cssutils
from calibre import force_unicode, human_readable, prepare_string_for_xml
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
@ -166,6 +167,24 @@ class BadNamespace(BaseError):
container.dirty(self.name)
return True
class NonUTF8(BaseError):
    """Warning raised for a file whose declared encoding is not UTF-8.

    Calling the instance with a container applies the automatic fix: the
    encoding declarations in the file are rewritten and the file is saved
    back encoded as UTF-8.
    """

    level = WARN
    INDIVIDUAL_FIX = _("Change this file's encoding to UTF-8")

    def __init__(self, name, enc):
        """Create the error for file ``name`` declared with encoding ``enc``."""
        BaseError.__init__(self, _('Non UTF-8 encoding declaration'), name)
        # Interpolate the declared encoding into the message; without this
        # the user is shown a literal "%s" and ``enc`` is never used.
        self.HELP = _('This file has its encoding declared as %s. Some'
                      ' reader software cannot handle non-UTF8 encoded files.'
                      ' You should change the encoding to UTF-8.') % enc

    def __call__(self, container):
        """Rewrite the file's encoding declarations and save it as UTF-8.

        Returns ``True`` if the file was modified, ``False`` otherwise.
        """
        raw = container.raw_data(self.name)
        # Only text data is rewritten; anything else is left untouched.
        if isinstance(raw, type('')):
            raw, changed = replace_encoding_declarations(raw)
            if changed:
                # Close the handle deterministically instead of relying on
                # garbage collection to flush and release it.
                with container.open(self.name, 'wb') as f:
                    f.write(raw.encode('utf-8'))
                return True
        return False
class EntitityProcessor(object):
@ -208,6 +227,13 @@ def check_html_size(name, mt, raw):
# Byte-string pattern for an entity reference: '&', an optional '#', then one
# to eight ASCII letters or digits, then ';'. Group 1 captures the text
# between the '&' and the ';'.
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
def check_encoding_declarations(name, container):
    """Return a list of encoding-declaration problems for the file ``name``.

    The file's raw data is read from ``container`` and scanned for a declared
    encoding. The result is a one-element list holding a ``NonUTF8`` error
    when an encoding is declared and its lower-cased name is not ``'utf-8'``;
    otherwise the list is empty.
    """
    declared = find_declared_encoding(container.raw_data(name))
    # No declaration at all, or one that already says utf-8: nothing to report.
    if declared is None or declared.lower() == 'utf-8':
        return []
    return [NonUTF8(name, declared)]
def check_xml_parsing(name, mt, raw):
if not raw:
return [EmptyFile(name)]