When decoding NCX toc files, if no encoding is declared and detection has less than 100% confidence, assume UTF-8. Fixes #5039 (Strange behaviour of TOC for one character)

This commit is contained in:
Kovid Goyal 2010-03-03 01:43:38 -07:00
parent 13a9733d42
commit 833c54c5d2
2 changed files with 8 additions and 5 deletions

View File

@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
def force_encoding(raw, verbose):
def force_encoding(raw, verbose, assume_utf8=False):
from calibre.constants import preferred_encoding
try:
chardet = detect(raw)
except:
chardet = {'encoding':preferred_encoding, 'confidence':0}
encoding = chardet['encoding']
if chardet['confidence'] < 1 and assume_utf8:
encoding = 'utf-8'
if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
if not encoding:
@ -73,7 +75,7 @@ def force_encoding(raw, verbose):
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
resolve_entities=False):
resolve_entities=False, assume_utf8=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
encoding = match.group(1)
break
if encoding is None:
encoding = force_encoding(raw, verbose)
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
try:
if encoding.lower().strip() == 'macintosh':
encoding = 'mac-roman'

View File

@ -149,7 +149,8 @@ class TOC(list):
def read_ncx_toc(self, toc):
self.base_path = os.path.dirname(toc)
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
soup = NCXSoup(raw)
def process_navpoint(np, dest):
play_order = np.get('playOrder', None)
@ -160,7 +161,7 @@ class TOC(list):
if nl is not None:
text = u''
for txt in nl.findAll(re.compile('text')):
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
text += u''.join([unicode(s) for s in txt.findAll(text=True)])
content = np.find(re.compile('content'))
if content is None or not content.has_key('src') or not txt:
return