HTML Input: Fix regression that broke processing of a small fraction of HTML files encoded in a multi-byte character encoding. Fixes #899691 (HTML input rarely saved as "html" not "zip")

This commit is contained in:
Kovid Goyal 2011-12-04 10:00:28 +05:30
parent 527869d5c1
commit a45ea253c8
2 changed files with 36 additions and 34 deletions

View File

@ -53,7 +53,6 @@ def substitute_entites(raw):
_CHARSET_ALIASES = { "macintosh" : "mac-roman", _CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" } "x-sjis" : "shift-jis" }
def force_encoding(raw, verbose, assume_utf8=False): def force_encoding(raw, verbose, assume_utf8=False):
from calibre.constants import preferred_encoding from calibre.constants import preferred_encoding
try: try:
@ -74,6 +73,36 @@ def force_encoding(raw, verbose, assume_utf8=False):
encoding = 'utf-8' encoding = 'utf-8'
return encoding return encoding
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
if not raw or isinstance(raw, unicode):
return raw, None
for x in ('utf8', 'utf-16-le', 'utf-16-be'):
bom = getattr(codecs, 'BOM_'+x.upper().replace('-16', '16').replace(
'-', '_'))
if raw.startswith(bom):
return raw[len(bom):], x
encoding = None
for pat in ENCODING_PATS:
match = pat.search(raw)
if match:
encoding = match.group(1)
break
if encoding is None:
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
if encoding.lower().strip() == 'macintosh':
encoding = 'mac-roman'
if encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# Microsoft Word exports to HTML with encoding incorrectly set to
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
encoding = 'gbk'
try:
codecs.lookup(encoding)
except LookupError:
encoding = 'utf-8'
return raw, encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
resolve_entities=False, assume_utf8=False): resolve_entities=False, assume_utf8=False):
@ -83,43 +112,16 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
prints a warning if detection confidence is < 100% prints a warning if detection confidence is < 100%
@return: (unicode, encoding used) @return: (unicode, encoding used)
''' '''
encoding = None
if not raw: if not raw:
return u'', encoding return u'', None
raw, encoding = detect_xml_encoding(raw, verbose=verbose,
assume_utf8=assume_utf8)
if not isinstance(raw, unicode): if not isinstance(raw, unicode):
if raw.startswith(codecs.BOM_UTF8): raw = raw.decode(encoding, 'replace')
raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
elif raw.startswith(codecs.BOM_UTF16_LE):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
elif raw.startswith(codecs.BOM_UTF16_BE):
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
if not isinstance(raw, unicode):
for pat in ENCODING_PATS:
match = pat.search(raw)
if match:
encoding = match.group(1)
break
if encoding is None:
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
try:
if encoding.lower().strip() == 'macintosh':
encoding = 'mac-roman'
if encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# Microsoft Word exports to HTML with encoding incorrectly set to
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
encoding = 'gbk'
raw = raw.decode(encoding, 'replace')
except LookupError:
encoding = 'utf-8'
raw = raw.decode(encoding, 'replace')
if strip_encoding_pats: if strip_encoding_pats:
raw = strip_encoding_declarations(raw) raw = strip_encoding_declarations(raw)
if resolve_entities: if resolve_entities:
raw = substitute_entites(raw) raw = substitute_entites(raw)
return raw, encoding return raw, encoding

View File

@ -18,7 +18,7 @@ from functools import partial
from itertools import izip from itertools import izip
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import detect_xml_encoding
from calibre.customize.conversion import OptionRecommendation from calibre.customize.conversion import OptionRecommendation
from calibre.constants import islinux, isbsd, iswindows from calibre.constants import islinux, isbsd, iswindows
from calibre import unicode_path, as_unicode from calibre import unicode_path, as_unicode
@ -121,7 +121,7 @@ class HTMLFile(object):
if not self.is_binary: if not self.is_binary:
if not encoding: if not encoding:
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1] encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
self.encoding = encoding self.encoding = encoding
else: else:
self.encoding = encoding self.encoding = encoding