Edit Book: Check Book: Add a check for HTML files with non UTF-8 encoding declarations

2025-07-09 03:04:10 -04:00 · 2014-06-22 17:37:30 +05:30 · 2014-06-22 17:37:30 +05:30 · 474455a7fd
commit 474455a7fd
parent a09c890518
3 changed files with 37 additions and 1 deletions
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@ -44,6 +44,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    raw = prefix + suffix
    return raw, changed[0]

+def find_declared_encoding(raw, limit=50*1024):
+    '''
+    Search the first ``limit`` items of ``raw`` with each pattern in
+    ENCODING_PATS, in order, and return group 1 of the first pattern that
+    matches (callers treat this as the declared encoding name). Returns None
+    when no pattern matches.
+    '''
+    head = raw[:limit]
+    # Lazy generator, so patterns after the first successful one are never run
+    hits = (pat.search(head) for pat in ENCODING_PATS)
+    return next((m.group(1) for m in hits if m is not None), None)
+
 def substitute_entites(raw):
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@ -14,7 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
 from calibre.ebooks.oeb.polish.check.base import run_checkers
 from calibre.ebooks.oeb.polish.check.parsing import (
    check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
-    check_html_size, check_ids, EmptyFile)
+    check_html_size, check_ids, EmptyFile, check_encoding_declarations)
 from calibre.ebooks.oeb.polish.check.images import check_raster_images
 from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
 from calibre.ebooks.oeb.polish.check.fonts import check_fonts
@ -52,6 +52,9 @@ def run_checks(container):
            continue
        errors.extend(check_css_parsing(name, raw))

+    for name, mt, raw in html_items + xml_items:
+        errors.extend(check_encoding_declarations(name, container))
+
    for name, mt, raw in html_items:
        if not raw:
            continue
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@ -12,6 +12,7 @@ from lxml.etree import XMLParser, fromstring, XMLSyntaxError
 import cssutils

 from calibre import force_unicode, human_readable, prepare_string_for_xml
+from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
 from calibre.ebooks.html_entities import html5_entities
 from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
 from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
@ -166,6 +167,24 @@ class BadNamespace(BaseError):
        container.dirty(self.name)
        return True

+class NonUTF8(BaseError):
+    ''' Reported for a file that declares an encoding other than UTF-8. '''
+
+    level = WARN
+    INDIVIDUAL_FIX = _("Change this file's encoding to UTF-8")
+
+    def __init__(self, name, enc):
+        '''
+        :param name: Name of the offending file
+        :param enc: The encoding the file declares, shown in the help text
+        '''
+        BaseError.__init__(self, _('Non UTF-8 encoding declaration'), name)
+        # The translated template contains a %s placeholder, it must be
+        # filled in with the declared encoding, otherwise a literal "%s" is
+        # shown to the user and enc is never used.
+        self.HELP = _('This file has its encoding declared as %s. Some'
+                      ' reader software cannot handle non-UTF8 encoded files.'
+                      ' You should change the encoding to UTF-8.') % enc
+
+    def __call__(self, container):
+        '''
+        Replace the encoding declarations in the file with UTF-8 ones and
+        write the result back encoded as UTF-8.
+
+        :return: True if the file was re-written, False otherwise
+        '''
+        raw = container.raw_data(self.name)
+        # Skip data that is not of the native '' string type
+        if not isinstance(raw, type('')):
+            return False
+        raw, changed = replace_encoding_declarations(raw)
+        if not changed:
+            return False
+        # Close the file deterministically instead of relying on garbage
+        # collection of the anonymous file object
+        with container.open(self.name, 'wb') as f:
+            f.write(raw.encode('utf-8'))
+        return True

 class EntitityProcessor(object):

@ -208,6 +227,13 @@ def check_html_size(name, mt, raw):

 entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')

+def check_encoding_declarations(name, container):
+    '''
+    Return a list holding a single NonUTF8 entry when the raw data of ``name``
+    declares an encoding that is not (case-insensitively) 'utf-8', otherwise
+    an empty list.
+    '''
+    declared = find_declared_encoding(container.raw_data(name))
+    if declared is None or declared.lower() == 'utf-8':
+        return []
+    return [NonUTF8(name, declared)]
+
 def check_xml_parsing(name, mt, raw):
    if not raw:
        return [EmptyFile(name)]