mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When decoding NCX toc files, if no encoding is declared and detection has less than 100% confidence, assume UTF-8. Fixes #5039 (Strange behaviour of TOC for one character)
This commit is contained in:
parent
13a9733d42
commit
833c54c5d2
@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
||||
"x-sjis" : "shift-jis" }
|
||||
|
||||
|
||||
def force_encoding(raw, verbose):
|
||||
def force_encoding(raw, verbose, assume_utf8=False):
|
||||
from calibre.constants import preferred_encoding
|
||||
try:
|
||||
chardet = detect(raw)
|
||||
except:
|
||||
chardet = {'encoding':preferred_encoding, 'confidence':0}
|
||||
encoding = chardet['encoding']
|
||||
if chardet['confidence'] < 1 and assume_utf8:
|
||||
encoding = 'utf-8'
|
||||
if chardet['confidence'] < 1 and verbose:
|
||||
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
|
||||
if not encoding:
|
||||
@ -73,7 +75,7 @@ def force_encoding(raw, verbose):
|
||||
|
||||
|
||||
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||
resolve_entities=False):
|
||||
resolve_entities=False, assume_utf8=False):
|
||||
'''
|
||||
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
||||
encoding declaration first, if not found uses the chardet library and
|
||||
@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||
encoding = match.group(1)
|
||||
break
|
||||
if encoding is None:
|
||||
encoding = force_encoding(raw, verbose)
|
||||
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
|
||||
try:
|
||||
if encoding.lower().strip() == 'macintosh':
|
||||
encoding = 'mac-roman'
|
||||
|
@ -149,7 +149,8 @@ class TOC(list):
|
||||
|
||||
def read_ncx_toc(self, toc):
|
||||
self.base_path = os.path.dirname(toc)
|
||||
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
|
||||
raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
|
||||
soup = NCXSoup(raw)
|
||||
|
||||
def process_navpoint(np, dest):
|
||||
play_order = np.get('playOrder', None)
|
||||
@ -160,7 +161,7 @@ class TOC(list):
|
||||
if nl is not None:
|
||||
text = u''
|
||||
for txt in nl.findAll(re.compile('text')):
|
||||
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
|
||||
text += u''.join([unicode(s) for s in txt.findAll(text=True)])
|
||||
content = np.find(re.compile('content'))
|
||||
if content is None or not content.has_key('src') or not txt:
|
||||
return
|
||||
|
Loading…
x
Reference in New Issue
Block a user