diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index 975ffc1331..25341b120a 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -53,13 +53,15 @@ _CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } -def force_encoding(raw, verbose): +def force_encoding(raw, verbose, assume_utf8=False): from calibre.constants import preferred_encoding try: chardet = detect(raw) except: chardet = {'encoding':preferred_encoding, 'confidence':0} encoding = chardet['encoding'] + if chardet['confidence'] < 1 and assume_utf8: + encoding = 'utf-8' if chardet['confidence'] < 1 and verbose: print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) if not encoding: @@ -73,7 +75,7 @@ def force_encoding(raw, verbose): def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, - resolve_entities=False): + resolve_entities=False, assume_utf8=False): ''' Force conversion of byte string to unicode. Tries to look for XML/HTML encoding declaration first, if not found uses the chardet library and @@ -95,7 +97,7 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, encoding = match.group(1) break if encoding is None: - encoding = force_encoding(raw, verbose) + encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8) try: if encoding.lower().strip() == 'macintosh': encoding = 'mac-roman' diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index 770ee905e3..5099b820d0 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -149,7 +149,8 @@ class TOC(list): def read_ncx_toc(self, toc): self.base_path = os.path.dirname(toc) - soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0]) + raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0] + soup = NCXSoup(raw) def process_navpoint(np, dest): play_order = np.get('playOrder', None) @@ -160,7 +161,7 @@ class TOC(list): if nl is not None: text = u'' for txt in nl.findAll(re.compile('text')): - text += ''.join([unicode(s) for s in txt.findAll(text=True)]) + text += u''.join([unicode(s) for s in txt.findAll(text=True)]) content = np.find(re.compile('content')) if content is None or not content.has_key('src') or not txt: return