mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When decoding NCX toc files, if no encoding is declared and detection has less than 100% confidence, assume UTF-8. Fixes #5039 (Strange behaviour of TOC for one character)
This commit is contained in:
parent
13a9733d42
commit
833c54c5d2
@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
|||||||
"x-sjis" : "shift-jis" }
|
"x-sjis" : "shift-jis" }
|
||||||
|
|
||||||
|
|
||||||
def force_encoding(raw, verbose):
|
def force_encoding(raw, verbose, assume_utf8=False):
|
||||||
from calibre.constants import preferred_encoding
|
from calibre.constants import preferred_encoding
|
||||||
try:
|
try:
|
||||||
chardet = detect(raw)
|
chardet = detect(raw)
|
||||||
except:
|
except:
|
||||||
chardet = {'encoding':preferred_encoding, 'confidence':0}
|
chardet = {'encoding':preferred_encoding, 'confidence':0}
|
||||||
encoding = chardet['encoding']
|
encoding = chardet['encoding']
|
||||||
|
if chardet['confidence'] < 1 and assume_utf8:
|
||||||
|
encoding = 'utf-8'
|
||||||
if chardet['confidence'] < 1 and verbose:
|
if chardet['confidence'] < 1 and verbose:
|
||||||
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
|
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
|
||||||
if not encoding:
|
if not encoding:
|
||||||
@ -73,7 +75,7 @@ def force_encoding(raw, verbose):
|
|||||||
|
|
||||||
|
|
||||||
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||||
resolve_entities=False):
|
resolve_entities=False, assume_utf8=False):
|
||||||
'''
|
'''
|
||||||
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
||||||
encoding declaration first, if not found uses the chardet library and
|
encoding declaration first, if not found uses the chardet library and
|
||||||
@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
|||||||
encoding = match.group(1)
|
encoding = match.group(1)
|
||||||
break
|
break
|
||||||
if encoding is None:
|
if encoding is None:
|
||||||
encoding = force_encoding(raw, verbose)
|
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
|
||||||
try:
|
try:
|
||||||
if encoding.lower().strip() == 'macintosh':
|
if encoding.lower().strip() == 'macintosh':
|
||||||
encoding = 'mac-roman'
|
encoding = 'mac-roman'
|
||||||
|
@ -149,7 +149,8 @@ class TOC(list):
|
|||||||
|
|
||||||
def read_ncx_toc(self, toc):
|
def read_ncx_toc(self, toc):
|
||||||
self.base_path = os.path.dirname(toc)
|
self.base_path = os.path.dirname(toc)
|
||||||
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
|
raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
|
||||||
|
soup = NCXSoup(raw)
|
||||||
|
|
||||||
def process_navpoint(np, dest):
|
def process_navpoint(np, dest):
|
||||||
play_order = np.get('playOrder', None)
|
play_order = np.get('playOrder', None)
|
||||||
@ -160,7 +161,7 @@ class TOC(list):
|
|||||||
if nl is not None:
|
if nl is not None:
|
||||||
text = u''
|
text = u''
|
||||||
for txt in nl.findAll(re.compile('text')):
|
for txt in nl.findAll(re.compile('text')):
|
||||||
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
|
text += u''.join([unicode(s) for s in txt.findAll(text=True)])
|
||||||
content = np.find(re.compile('content'))
|
content = np.find(re.compile('content'))
|
||||||
if content is None or not content.has_key('src') or not txt:
|
if content is None or not content.has_key('src') or not txt:
|
||||||
return
|
return
|
||||||
|
Loading…
x
Reference in New Issue
Block a user