mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit Book: Check Book: Add a check for HTML files with non UTF-8 encoding declarations
This commit is contained in:
parent
a09c890518
commit
474455a7fd
@ -44,6 +44,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
|
|||||||
raw = prefix + suffix
|
raw = prefix + suffix
|
||||||
return raw, changed[0]
|
return raw, changed[0]
|
||||||
|
|
||||||
|
def find_declared_encoding(raw, limit=50*1024):
    """Return the encoding name declared near the start of ``raw``.

    Only the first ``limit`` items of ``raw`` are searched. Every pattern in
    ENCODING_PATS is tried in order, and the first capture group of the
    first pattern that matches is returned. ``None`` is returned when no
    pattern matches.
    """
    head = raw[:limit]
    # Lazy generator: patterns after the first successful match are never run,
    # exactly like an early return from an explicit loop.
    hits = (pattern.search(head) for pattern in ENCODING_PATS)
    return next((hit.group(1) for hit in hits if hit is not None), None)
|
||||||
|
|
||||||
def substitute_entites(raw):
    """Return ``raw`` with every ENTITY_PATTERN match replaced.

    Each match is handed to calibre's ``xml_entity_to_unicode`` and replaced
    by whatever that callback returns.
    """
    # Kept as a function-level import, as in the original code.
    from calibre import xml_entity_to_unicode as to_unicode
    converted = ENTITY_PATTERN.sub(to_unicode, raw)
    return converted
|
||||||
|
@ -14,7 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image
|
|||||||
from calibre.ebooks.oeb.polish.check.base import run_checkers
|
from calibre.ebooks.oeb.polish.check.base import run_checkers
|
||||||
from calibre.ebooks.oeb.polish.check.parsing import (
|
from calibre.ebooks.oeb.polish.check.parsing import (
|
||||||
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
|
check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag,
|
||||||
check_html_size, check_ids, EmptyFile)
|
check_html_size, check_ids, EmptyFile, check_encoding_declarations)
|
||||||
from calibre.ebooks.oeb.polish.check.images import check_raster_images
|
from calibre.ebooks.oeb.polish.check.images import check_raster_images
|
||||||
from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
|
from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations
|
||||||
from calibre.ebooks.oeb.polish.check.fonts import check_fonts
|
from calibre.ebooks.oeb.polish.check.fonts import check_fonts
|
||||||
@ -52,6 +52,9 @@ def run_checks(container):
|
|||||||
continue
|
continue
|
||||||
errors.extend(check_css_parsing(name, raw))
|
errors.extend(check_css_parsing(name, raw))
|
||||||
|
|
||||||
|
for name, mt, raw in html_items + xml_items:
|
||||||
|
errors.extend(check_encoding_declarations(name, container))
|
||||||
|
|
||||||
for name, mt, raw in html_items:
|
for name, mt, raw in html_items:
|
||||||
if not raw:
|
if not raw:
|
||||||
continue
|
continue
|
||||||
|
@ -12,6 +12,7 @@ from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
|||||||
import cssutils
|
import cssutils
|
||||||
|
|
||||||
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
||||||
|
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
|
||||||
from calibre.ebooks.html_entities import html5_entities
|
from calibre.ebooks.html_entities import html5_entities
|
||||||
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
||||||
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
|
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
|
||||||
@ -166,6 +167,24 @@ class BadNamespace(BaseError):
|
|||||||
container.dirty(self.name)
|
container.dirty(self.name)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
class NonUTF8(BaseError):
    """Warning for a file whose encoding declaration is not UTF-8.

    Calling an instance with a container applies the automatic fix: the
    file's encoding declarations are rewritten by
    replace_encoding_declarations() and the result is written back to the
    container as UTF-8 encoded bytes.
    """

    level = WARN
    INDIVIDUAL_FIX = _("Change this file's encoding to UTF-8")

    def __init__(self, name, enc):
        """
        :param name: Name of the offending file in the container
        :param enc: The encoding found in the file's declaration
        """
        BaseError.__init__(self, _('Non UTF-8 encoding declaration'), name)
        # The translated template contains a %s placeholder, so the declared
        # encoding must be interpolated; otherwise the help text shows a
        # literal "%s" and the enc argument is silently ignored.
        self.HELP = _('This file has its encoding declared as %s. Some'
                      ' reader software cannot handle non-UTF8 encoded files.'
                      ' You should change the encoding to UTF-8.') % enc

    def __call__(self, container):
        """Rewrite the encoding declarations of the file to UTF-8.

        :param container: The book container that holds the file
        :return: True if the file was rewritten, False otherwise
        """
        raw = container.raw_data(self.name)
        # Only text data can be processed; type('') is the text type of the
        # string literals in this module.
        if isinstance(raw, type('')):
            raw, changed = replace_encoding_declarations(raw)
            if changed:
                # Use a context manager so the file is flushed and closed
                # deterministically instead of relying on garbage collection.
                with container.open(self.name, 'wb') as f:
                    f.write(raw.encode('utf-8'))
                return True
        # NOTE(review): True is taken to mean "the file was modified", so the
        # unchanged case reports False -- confirm against the code that
        # invokes the fixers.
        return False
|
||||||
|
|
||||||
class EntitityProcessor(object):
|
class EntitityProcessor(object):
|
||||||
|
|
||||||
@ -208,6 +227,13 @@ def check_html_size(name, mt, raw):
|
|||||||
|
|
||||||
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
|
entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')
|
||||||
|
|
||||||
|
def check_encoding_declarations(name, container):
    """Check the file ``name`` for an encoding declaration other than UTF-8.

    The declared encoding is looked up with find_declared_encoding() on the
    data returned by ``container.raw_data(name)``. Returns a list holding a
    single NonUTF8 error when an encoding is declared and its lower-cased
    name is not exactly 'utf-8'; otherwise returns an empty list.
    """
    declared = find_declared_encoding(container.raw_data(name))
    # Nothing to report when there is no declaration or it is already UTF-8.
    if declared is None or declared.lower() == 'utf-8':
        return []
    return [NonUTF8(name, declared)]
|
||||||
|
|
||||||
def check_xml_parsing(name, mt, raw):
|
def check_xml_parsing(name, mt, raw):
|
||||||
if not raw:
|
if not raw:
|
||||||
return [EmptyFile(name)]
|
return [EmptyFile(name)]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user