mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
HTML Input: Fix regression that broke processing of a small fraction of HTML files encoded in a multi-byte character encoding. Fixes #899691 (HTML input rarely saved as "html" not "zip")
This commit is contained in:
parent
527869d5c1
commit
a45ea253c8
@ -53,7 +53,6 @@ def substitute_entites(raw):
|
|||||||
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
||||||
"x-sjis" : "shift-jis" }
|
"x-sjis" : "shift-jis" }
|
||||||
|
|
||||||
|
|
||||||
def force_encoding(raw, verbose, assume_utf8=False):
|
def force_encoding(raw, verbose, assume_utf8=False):
|
||||||
from calibre.constants import preferred_encoding
|
from calibre.constants import preferred_encoding
|
||||||
try:
|
try:
|
||||||
@ -74,6 +73,36 @@ def force_encoding(raw, verbose, assume_utf8=False):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
return encoding
|
return encoding
|
||||||
|
|
||||||
|
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    '''
    Determine the encoding of `raw` without decoding it.

    If `raw` is empty or already a unicode object it is returned untouched
    with an encoding of None. Otherwise the checks are, in order: a leading
    UTF-8/UTF-16 byte order mark (which is removed from the returned data),
    the first pattern in ENCODING_PATS that matches, and finally
    force_encoding(), to which `verbose` and `assume_utf8` are passed.

    @return: (raw, encoding name or None)
    '''
    if not raw or isinstance(raw, unicode):
        return raw, None

    # A byte order mark settles the question; drop it from the data.
    known_boms = (
        ('utf8', codecs.BOM_UTF8),
        ('utf-16-le', codecs.BOM_UTF16_LE),
        ('utf-16-be', codecs.BOM_UTF16_BE),
    )
    for name, bom in known_boms:
        if raw.startswith(bom):
            return raw[len(bom):], name

    # Use the first encoding declaration found in the data, if any.
    hits = (pat.search(raw) for pat in ENCODING_PATS)
    encoding = next((hit.group(1) for hit in hits if hit), None)
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)

    gb2312_names = (
        'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
        'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58')
    canonical = encoding.lower().strip()
    if canonical == 'macintosh':
        encoding = 'mac-roman'
    elif canonical.replace('_', '-') in gb2312_names:
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'

    # Fall back to utf-8 when Python has no codec with the detected name.
    try:
        codecs.lookup(encoding)
    except LookupError:
        encoding = 'utf-8'

    return raw, encoding
|
||||||
|
|
||||||
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||||
resolve_entities=False, assume_utf8=False):
|
resolve_entities=False, assume_utf8=False):
|
||||||
@ -83,43 +112,16 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
|||||||
prints a warning if detection confidence is < 100%
|
prints a warning if detection confidence is < 100%
|
||||||
@return: (unicode, encoding used)
|
@return: (unicode, encoding used)
|
||||||
'''
|
'''
|
||||||
encoding = None
|
|
||||||
if not raw:
|
if not raw:
|
||||||
return u'', encoding
|
return u'', None
|
||||||
|
raw, encoding = detect_xml_encoding(raw, verbose=verbose,
|
||||||
|
assume_utf8=assume_utf8)
|
||||||
if not isinstance(raw, unicode):
|
if not isinstance(raw, unicode):
|
||||||
if raw.startswith(codecs.BOM_UTF8):
|
raw = raw.decode(encoding, 'replace')
|
||||||
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
|
|
||||||
elif raw.startswith(codecs.BOM_UTF16_LE):
|
|
||||||
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
|
|
||||||
elif raw.startswith(codecs.BOM_UTF16_BE):
|
|
||||||
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
|
|
||||||
if not isinstance(raw, unicode):
|
|
||||||
for pat in ENCODING_PATS:
|
|
||||||
match = pat.search(raw)
|
|
||||||
if match:
|
|
||||||
encoding = match.group(1)
|
|
||||||
break
|
|
||||||
if encoding is None:
|
|
||||||
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
|
|
||||||
try:
|
|
||||||
if encoding.lower().strip() == 'macintosh':
|
|
||||||
encoding = 'mac-roman'
|
|
||||||
if encoding.lower().replace('_', '-').strip() in (
|
|
||||||
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
|
|
||||||
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
|
|
||||||
# Microsoft Word exports to HTML with encoding incorrectly set to
|
|
||||||
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
|
|
||||||
encoding = 'gbk'
|
|
||||||
raw = raw.decode(encoding, 'replace')
|
|
||||||
except LookupError:
|
|
||||||
encoding = 'utf-8'
|
|
||||||
raw = raw.decode(encoding, 'replace')
|
|
||||||
|
|
||||||
if strip_encoding_pats:
|
if strip_encoding_pats:
|
||||||
raw = strip_encoding_declarations(raw)
|
raw = strip_encoding_declarations(raw)
|
||||||
if resolve_entities:
|
if resolve_entities:
|
||||||
raw = substitute_entites(raw)
|
raw = substitute_entites(raw)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return raw, encoding
|
return raw, encoding
|
||||||
|
@ -18,7 +18,7 @@ from functools import partial
|
|||||||
from itertools import izip
|
from itertools import izip
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import detect_xml_encoding
|
||||||
from calibre.customize.conversion import OptionRecommendation
|
from calibre.customize.conversion import OptionRecommendation
|
||||||
from calibre.constants import islinux, isbsd, iswindows
|
from calibre.constants import islinux, isbsd, iswindows
|
||||||
from calibre import unicode_path, as_unicode
|
from calibre import unicode_path, as_unicode
|
||||||
@ -121,7 +121,7 @@ class HTMLFile(object):
|
|||||||
|
|
||||||
if not self.is_binary:
|
if not self.is_binary:
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
else:
|
else:
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
Loading…
x
Reference in New Issue
Block a user