From 99db7985bd6d221e9d875ab428a412532117e628 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 26 Mar 2014 15:12:57 +0530 Subject: [PATCH] AZW3 Input: Handle files with garbage bytes in their table of contents. Fixes #1297713 [private](https://bugs.launchpad.net/calibre/+bug/1297713) --- src/calibre/ebooks/metadata/toc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index f2f49e2c63..9c19f6b59e 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -13,6 +13,7 @@ from lxml.builder import ElementMaker from calibre.constants import __appname__, __version__ from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.chardet import xml_to_unicode +from calibre.utils.cleantext import clean_xml_chars NCX_NS = "http://www.daisy.org/z3986/2005/ncx/" CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata" @@ -136,7 +137,7 @@ class TOC(list): try: if not os.path.exists(toc): bn = os.path.basename(toc) - bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files + bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files toc = os.path.join(os.path.dirname(toc), bn) self.read_html_toc(toc) @@ -258,6 +259,7 @@ class TOC(list): text = '' c[1] += 1 item_id = 'num_%d'%c[1] + text = clean_xml_chars(text) elem = E.navPoint( E.navLabel(E.text(re.sub(r'\s+', ' ', text))), E.content(src=unicode(np.href)+(('#' + unicode(np.fragment))