diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 0785252c4d..61aefafdac 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -44,6 +44,18 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
     raw = prefix + suffix
     return raw, changed[0]
 
+def find_declared_encoding(raw, limit=50*1024):
+    '''
+    Return the encoding captured by the first pattern in ENCODING_PATS that
+    matches within raw[:limit], or None if no pattern matches.
+    '''
+    prefix = raw[:limit]
+    for pat in ENCODING_PATS:
+        m = pat.search(prefix)
+        if m is not None:
+            return m.group(1)
+    return None
+
 def substitute_entites(raw):
     from calibre import xml_entity_to_unicode
     return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py
index 57fe657a95..aa0e2f748c 100644
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@@ -14,7 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
 from calibre.ebooks.oeb.polish.check.parsing import (
     check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
-    check_html_size, check_ids, EmptyFile)
+    check_html_size, check_ids, EmptyFile, check_encoding_declarations)
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
 from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
 from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@@ -52,6 +52,9 @@ def run_checks(container):
             continue
         errors.extend(check_css_parsing(name, raw))
 
+    for name, mt, raw in html_items + xml_items:
+        errors.extend(check_encoding_declarations(name, container))
+
     for name, mt, raw in html_items:
         if not raw:
             continue
diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py
index ee868e97c7..3acbcd325a 100644
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@@ -12,6 +12,7 @@ from lxml.etree import XMLParser, fromstring, XMLSyntaxError
 import cssutils
 
 from calibre import force_unicode, human_readable, prepare_string_for_xml
+from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
 from calibre.ebooks.html_entities import html5_entities
 from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
 from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
@@ -166,6 +167,37 @@ class BadNamespace(BaseError):
         container.dirty(self.name)
         return True
 
+class NonUTF8(BaseError):
+    ''' Warning raised for a file whose declared encoding is not UTF-8. '''
+
+    level = WARN
+    INDIVIDUAL_FIX = _("Change this file's encoding to UTF-8")
+
+    def __init__(self, name, enc):
+        '''
+        :param name: Name of the file carrying the declaration
+        :param enc: The encoding that the file declares
+        '''
+        BaseError.__init__(self, _('Non UTF-8 encoding declaration'), name)
+        # Interpolate enc, otherwise the user is shown a literal %s
+        self.HELP = _('This file has its encoding declared as %s. Some'
+                      ' reader software cannot handle non-UTF8 encoded files.'
+                      ' You should change the encoding to UTF-8.') % enc
+
+    def __call__(self, container):
+        '''
+        Point the encoding declarations in this file at utf-8 and, if any
+        of them changed, write the file back encoded as UTF-8.
+        '''
+        raw = container.raw_data(self.name)
+        if isinstance(raw, type('')):
+            raw, changed = replace_encoding_declarations(raw)
+            if changed:
+                # Use a with block so the file is closed (and flushed)
+                # immediately instead of whenever it is garbage collected
+                with container.open(self.name, 'wb') as f:
+                    f.write(raw.encode('utf-8'))
+        return True
 
 class EntitityProcessor(object):
 
@@ -208,6 +240,17 @@ def check_html_size(name, mt, raw):
 
 entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
 
+def check_encoding_declarations(name, container):
+    '''
+    Return a list containing a NonUTF8 warning if the file called name
+    declares an encoding other than utf-8, otherwise an empty list.
+    '''
+    errors = []
+    enc = find_declared_encoding(container.raw_data(name))
+    if enc is not None and enc.lower() != 'utf-8':
+        errors.append(NonUTF8(name, enc))
+    return errors
+
 def check_xml_parsing(name, mt, raw):
     if not raw:
         return [EmptyFile(name)]