When decoding NCX toc files, if no encoding is declared and detection has less than 100% confidence, assume UTF-8. Fixes #5039 (Strange behaviour of TOC for one character)

2025-07-09 03:04:10 -04:00 · 2010-03-03 01:43:38 -07:00 · 2010-03-03 01:43:38 -07:00 · 833c54c5d2
commit 833c54c5d2
parent 13a9733d42
2 changed files with 8 additions and 5 deletions
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }


-def force_encoding(raw, verbose):
+def force_encoding(raw, verbose, assume_utf8=False):
    from calibre.constants import preferred_encoding
    try:
        chardet = detect(raw)
    except:
        chardet = {'encoding':preferred_encoding, 'confidence':0}
    encoding = chardet['encoding']
+    if chardet['confidence'] < 1 and assume_utf8:
+        encoding = 'utf-8'
    if chardet['confidence'] < 1 and verbose:
        print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
    if not encoding:
@ -73,7 +75,7 @@ def force_encoding(raw, verbose):


 def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
-                   resolve_entities=False):
+                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and
@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                encoding = match.group(1)
                break
        if encoding is None:
-            encoding = force_encoding(raw, verbose)
+            encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
        try:
            if encoding.lower().strip() == 'macintosh':
                encoding = 'mac-roman'
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@ -149,7 +149,8 @@ class TOC(list):

    def read_ncx_toc(self, toc):
        self.base_path = os.path.dirname(toc)
-        soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
+        raw  = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
+        soup = NCXSoup(raw)

        def process_navpoint(np, dest):
            play_order = np.get('playOrder', None)
@ -160,7 +161,7 @@ class TOC(list):
            if nl is not None:
                text = u''
                for txt in nl.findAll(re.compile('text')):
-                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
+                    text += u''.join([unicode(s) for s in txt.findAll(text=True)])
                content = np.find(re.compile('content'))
                if content is None or not content.has_key('src') or not txt:
                    return