mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix detection of UTF-16 and UTF-32 in UnicodeDammit
This commit is contained in:
parent
782b195531
commit
731811e871
@ -1663,6 +1663,7 @@ class UnicodeDammit:
|
|||||||
self._detectEncoding(markup)
|
self._detectEncoding(markup)
|
||||||
self.smartQuotesTo = smartQuotesTo
|
self.smartQuotesTo = smartQuotesTo
|
||||||
self.triedEncodings = []
|
self.triedEncodings = []
|
||||||
|
|
||||||
if markup == '' or isinstance(markup, unicode):
|
if markup == '' or isinstance(markup, unicode):
|
||||||
self.originalEncoding = None
|
self.originalEncoding = None
|
||||||
self.unicode = unicode(markup)
|
self.unicode = unicode(markup)
|
||||||
@ -1722,8 +1723,8 @@ class UnicodeDammit:
|
|||||||
self.markup = u
|
self.markup = u
|
||||||
self.originalEncoding = proposed
|
self.originalEncoding = proposed
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
# print "That didn't work!"
|
#print "That didn't work!"
|
||||||
# print e
|
#print e
|
||||||
return None
|
return None
|
||||||
#print "Correct encoding: %s" % proposed
|
#print "Correct encoding: %s" % proposed
|
||||||
return self.markup
|
return self.markup
|
||||||
@ -1750,7 +1751,9 @@ class UnicodeDammit:
|
|||||||
elif data[:4] == '\xff\xfe\x00\x00':
|
elif data[:4] == '\xff\xfe\x00\x00':
|
||||||
encoding = 'utf-32le'
|
encoding = 'utf-32le'
|
||||||
data = data[4:]
|
data = data[4:]
|
||||||
|
|
||||||
newdata = unicode(data, encoding)
|
newdata = unicode(data, encoding)
|
||||||
|
|
||||||
return newdata
|
return newdata
|
||||||
|
|
||||||
def _detectEncoding(self, xml_data):
|
def _detectEncoding(self, xml_data):
|
||||||
@ -1760,44 +1763,48 @@ class UnicodeDammit:
|
|||||||
if xml_data[:4] == '\x4c\x6f\xa7\x94':
|
if xml_data[:4] == '\x4c\x6f\xa7\x94':
|
||||||
# EBCDIC
|
# EBCDIC
|
||||||
xml_data = self._ebcdic_to_ascii(xml_data)
|
xml_data = self._ebcdic_to_ascii(xml_data)
|
||||||
|
|
||||||
|
# By Kovid commented out all the recoding to UTF-8 of UTF-16 and UTF-32
|
||||||
|
# as this doesn't make sense and doesn't work for the test case
|
||||||
|
# BeautifulSoup.UnicodeDammit(u'abcd'.encode('utf-16')).unicode
|
||||||
elif xml_data[:4] == '\x00\x3c\x00\x3f':
|
elif xml_data[:4] == '\x00\x3c\x00\x3f':
|
||||||
# UTF-16BE
|
# UTF-16BE
|
||||||
sniffed_xml_encoding = 'utf-16be'
|
sniffed_xml_encoding = 'utf-16be'
|
||||||
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
|
#xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
|
||||||
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
|
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
|
||||||
and (xml_data[2:4] != '\x00\x00'):
|
and (xml_data[2:4] != '\x00\x00'):
|
||||||
# UTF-16BE with BOM
|
# UTF-16BE with BOM
|
||||||
sniffed_xml_encoding = 'utf-16be'
|
sniffed_xml_encoding = 'utf-16be'
|
||||||
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
|
#xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
|
||||||
elif xml_data[:4] == '\x3c\x00\x3f\x00':
|
elif xml_data[:4] == '\x3c\x00\x3f\x00':
|
||||||
# UTF-16LE
|
# UTF-16LE
|
||||||
sniffed_xml_encoding = 'utf-16le'
|
sniffed_xml_encoding = 'utf-16le'
|
||||||
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
|
#xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
|
||||||
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
|
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
|
||||||
(xml_data[2:4] != '\x00\x00'):
|
(xml_data[2:4] != '\x00\x00'):
|
||||||
# UTF-16LE with BOM
|
# UTF-16LE with BOM
|
||||||
sniffed_xml_encoding = 'utf-16le'
|
sniffed_xml_encoding = 'utf-16le'
|
||||||
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
|
#xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
|
||||||
elif xml_data[:4] == '\x00\x00\x00\x3c':
|
elif xml_data[:4] == '\x00\x00\x00\x3c':
|
||||||
# UTF-32BE
|
# UTF-32BE
|
||||||
sniffed_xml_encoding = 'utf-32be'
|
sniffed_xml_encoding = 'utf-32be'
|
||||||
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
|
#xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
|
||||||
elif xml_data[:4] == '\x3c\x00\x00\x00':
|
elif xml_data[:4] == '\x3c\x00\x00\x00':
|
||||||
# UTF-32LE
|
# UTF-32LE
|
||||||
sniffed_xml_encoding = 'utf-32le'
|
sniffed_xml_encoding = 'utf-32le'
|
||||||
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
|
#xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
|
||||||
elif xml_data[:4] == '\x00\x00\xfe\xff':
|
elif xml_data[:4] == '\x00\x00\xfe\xff':
|
||||||
# UTF-32BE with BOM
|
# UTF-32BE with BOM
|
||||||
sniffed_xml_encoding = 'utf-32be'
|
sniffed_xml_encoding = 'utf-32be'
|
||||||
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
|
#xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
|
||||||
elif xml_data[:4] == '\xff\xfe\x00\x00':
|
elif xml_data[:4] == '\xff\xfe\x00\x00':
|
||||||
# UTF-32LE with BOM
|
# UTF-32LE with BOM
|
||||||
sniffed_xml_encoding = 'utf-32le'
|
sniffed_xml_encoding = 'utf-32le'
|
||||||
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
|
#xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
|
||||||
elif xml_data[:3] == '\xef\xbb\xbf':
|
elif xml_data[:3] == '\xef\xbb\xbf':
|
||||||
# UTF-8 with BOM
|
# UTF-8 with BOM
|
||||||
sniffed_xml_encoding = 'utf-8'
|
sniffed_xml_encoding = 'utf-8'
|
||||||
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
|
#xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
|
||||||
else:
|
else:
|
||||||
sniffed_xml_encoding = 'ascii'
|
sniffed_xml_encoding = 'ascii'
|
||||||
pass
|
pass
|
||||||
@ -1810,6 +1817,7 @@ class UnicodeDammit:
|
|||||||
xml_encoding_match = None
|
xml_encoding_match = None
|
||||||
if xml_encoding_match:
|
if xml_encoding_match:
|
||||||
xml_encoding = xml_encoding_match.groups()[0].lower()
|
xml_encoding = xml_encoding_match.groups()[0].lower()
|
||||||
|
|
||||||
if sniffed_xml_encoding and \
|
if sniffed_xml_encoding and \
|
||||||
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
|
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
|
||||||
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
|
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user