mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit Book: Check Book: Add a check for HTML files with non UTF-8 encoding declarations
This commit is contained in:
parent
a09c890518
commit
474455a7fd
@ -44,6 +44,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
|
||||
raw = prefix + suffix
|
||||
return raw, changed[0]
|
||||
|
||||
def find_declared_encoding(raw, limit=50*1024):
    """Return the encoding named by the first declaration found in *raw*.

    Only the first *limit* items of *raw* are searched. Each pattern in
    ENCODING_PATS is tried in order; the first one that matches wins and
    its first capture group is returned. Returns None when no pattern
    matches.
    """
    head = raw[:limit]
    # Lazy generator: patterns after the first successful match are never run
    hits = (pat.search(head) for pat in ENCODING_PATS)
    return next((hit.group(1) for hit in hits if hit is not None), None)
|
||||
|
||||
def substitute_entites(raw):
    """Return *raw* with every ENTITY_PATTERN match replaced.

    Each match is passed to calibre's xml_entity_to_unicode, whose return
    value is used as the replacement text.
    """
    # Imported here rather than at module level, as in the original code
    from calibre import xml_entity_to_unicode as entity_to_unicode
    return ENTITY_PATTERN.sub(entity_to_unicode, raw)
|
||||
|
@ -14,7 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
|
||||
from calibre.ebooks.oeb.polish.check.base import run_checkers
|
||||
from calibre.ebooks.oeb.polish.check.parsing import (
|
||||
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
|
||||
check_html_size, check_ids, EmptyFile)
|
||||
check_html_size, check_ids, EmptyFile, check_encoding_declarations)
|
||||
from calibre.ebooks.oeb.polish.check.images import check_raster_images
|
||||
from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
|
||||
from calibre.ebooks.oeb.polish.check.fonts import check_fonts
|
||||
@ -52,6 +52,9 @@ def run_checks(container):
|
||||
continue
|
||||
errors.extend(check_css_parsing(name, raw))
|
||||
|
||||
for name, mt, raw in html_items + xml_items:
|
||||
errors.extend(check_encoding_declarations(name, container))
|
||||
|
||||
for name, mt, raw in html_items:
|
||||
if not raw:
|
||||
continue
|
||||
|
@ -12,6 +12,7 @@ from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
||||
import cssutils
|
||||
|
||||
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
||||
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
|
||||
from calibre.ebooks.html_entities import html5_entities
|
||||
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
||||
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
|
||||
@ -166,6 +167,24 @@ class BadNamespace(BaseError):
|
||||
container.dirty(self.name)
|
||||
return True
|
||||
|
||||
class NonUTF8(BaseError):
    """Warning raised for a file whose encoding declaration is not UTF-8.

    Calling the instance with a container applies the automatic fix: the
    file's encoding declarations are rewritten with
    replace_encoding_declarations() and the result is saved UTF-8 encoded.
    """

    level = WARN
    INDIVIDUAL_FIX = _("Change this file's encoding to UTF-8")

    def __init__(self, name, enc):
        """
        :param name: Name of the offending file
        :param enc: The encoding found in the file's declaration
        """
        BaseError.__init__(self, _('Non UTF-8 encoding declaration'), name)
        # Interpolate *after* translation so the translatable message stays
        # constant; previously the %s placeholder was never filled in and
        # was shown to the user literally.
        self.HELP = _('This file has its encoding declared as %s. Some'
                      ' reader software cannot handle non-UTF8 encoded files.'
                      ' You should change the encoding to UTF-8.') % enc

    def __call__(self, container):
        """Rewrite the encoding declarations of the file to UTF-8.

        Returns True if the file was modified. Falls through (returning
        None, which is falsy) when the raw data is not text or when no
        declaration needed changing.
        """
        raw = container.raw_data(self.name)
        if isinstance(raw, type('')):
            raw, changed = replace_encoding_declarations(raw)
            if changed:
                # Close the handle deterministically instead of relying on
                # garbage collection to flush the write.
                with container.open(self.name, 'wb') as f:
                    f.write(raw.encode('utf-8'))
                return True
|
||||
|
||||
class EntitityProcessor(object):
|
||||
|
||||
@ -208,6 +227,13 @@ def check_html_size(name, mt, raw):
|
||||
|
||||
# Bytes pattern for an entity reference: '&', an optional '#', then one to
# eight ASCII letters/digits (captured as group 1), terminated by ';'.
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
|
||||
|
||||
def check_encoding_declarations(name, container):
    """Check the file *name* for an encoding declaration other than UTF-8.

    The declared encoding is looked up with find_declared_encoding() on the
    container's raw data for the file. Returns a list holding a single
    NonUTF8 error when a declaration exists and, lower-cased, differs from
    'utf-8'; otherwise returns an empty list.
    """
    declared = find_declared_encoding(container.raw_data(name))
    if declared is None or declared.lower() == 'utf-8':
        return []
    return [NonUTF8(name, declared)]
|
||||
|
||||
def check_xml_parsing(name, mt, raw):
|
||||
if not raw:
|
||||
return [EmptyFile(name)]
|
||||
|
Loading…
x
Reference in New Issue
Block a user