From 921a579d35d892cf19dfef112bcb0cbe4f015a40 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 15 Jun 2016 23:50:58 +0530
Subject: [PATCH] EPUB Input: Implement reading of Table of Contents from EPUB
 3 files that do not specify a fallback NCX ToC

---
 .../ebooks/conversion/plugins/epub_input.py   | 60 ++++++++++++++++++-
 src/calibre/ebooks/metadata/opf2.py           | 45 +++++++++-----
 2 files changed, 89 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plugins/epub_input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py
index 8aa1bcf4ae..80ebc57f14 100644
--- a/src/calibre/ebooks/conversion/plugins/epub_input.py
+++ b/src/calibre/ebooks/conversion/plugins/epub_input.py
@@ -218,6 +218,10 @@ class EPUBInput(InputFormatPlugin):
                 raise DRMError(os.path.basename(path))
         self.encrypted_fonts = self._encrypted_font_uris
 
+        epub3_nav = opf.epub3_nav
+        if epub3_nav is not None:
+            self.convert_epub3_nav(epub3_nav, opf, log)
+
         if len(parts) > 1 and parts[0]:
             delta = '/'.join(parts[:-1])+'/'
             for elem in opf.itermanifest():
@@ -252,11 +256,65 @@ class EPUBInput(InputFormatPlugin):
         if len(list(opf.iterspine())) == 0:
             raise ValueError('No valid entries in the spine of this EPUB')
 
-        with open('content.opf', 'wb') as nopf:
+        with lopen('content.opf', 'wb') as nopf:
             nopf.write(opf.render())
 
         return os.path.abspath(u'content.opf')
 
+    def convert_epub3_nav(self, nav_path, opf, log):
+        ''' Write an NCX built from the toc <nav> of the EPUB 3 nav document at nav_path and set it as the spine toc in opf '''
+        from lxml import etree
+        from calibre.ebooks.chardet import xml_to_unicode
+        from calibre.ebooks.oeb.polish.parsing import parse
+        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
+        from calibre.ebooks.oeb.polish.toc import first_child
+        from tempfile import NamedTemporaryFile
+        with lopen(nav_path, 'rb') as src:
+            markup = xml_to_unicode(src.read(), strip_encoding_pats=True, assume_utf8=True)[0]
+        root = parse(markup, log=log)
+        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
+        navmap = ncx[0]
+        epub_type = '{%s}type' % EPUB_NS
+        nav_name = os.path.basename(nav_path)
+
+        def add_from_li(li, parent):
+            # Label and target are taken from the first <a> or <span> child only
+            href = text = None
+            for elem in li.iterchildren(XHTML('a'), XHTML('span')):
+                text = etree.tostring(elem, method='text', encoding=unicode, with_tail=False).strip() or ' '.join(elem.xpath('descendant-or-self::*/@title')).strip()
+                href = elem.get('href')
+                if href and href.startswith('#'):
+                    href = nav_name + href  # fragment-only links get the nav file name prepended
+                break
+            navpoint = parent.makeelement(NCX('navPoint'))
+            parent.append(navpoint)
+            navpoint.append(navpoint.makeelement(NCX('navLabel')))
+            navpoint[0].append(navpoint.makeelement(NCX('text')))
+            navpoint[0][0].text = text
+            if href:
+                navpoint.append(navpoint.makeelement(NCX('content'), attrib={'src':href}))
+            return navpoint
+
+        def process_nav_node(node, toc_parent):
+            for li in node.iterchildren(XHTML('li')):
+                navpoint = add_from_li(li, toc_parent)
+                nested = first_child(li, XHTML('ol'))
+                if navpoint is not None and nested is not None:
+                    process_nav_node(nested, navpoint)
+
+        for nav in root.iterdescendants(XHTML('nav')):
+            # Only the first toc nav that actually contains a list is converted
+            toplevel = first_child(nav, XHTML('ol')) if nav.get(epub_type) == 'toc' else None
+            if toplevel is not None:
+                process_nav_node(toplevel, navmap)
+                break
+
+        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as ncx_file:
+            ncx_file.write(etree.tostring(ncx, encoding='utf-8'))
+        ncx_id = opf.add_path_to_manifest(ncx_file.name, NCX_MIME)
+        for spine in opf.root.xpath('//*[local-name()="spine"]'):
+            spine.set('toc', ncx_id)
+
     def postprocess_book(self, oeb, opts, log):
         rc = getattr(self, 'removed_cover', None)
         if rc:
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 262d59aa82..bfe1095aad 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -219,17 +219,20 @@ class ManifestItem(Resource):  # {{{
 
 class Manifest(ResourceCollection):  # {{{
 
+    def append_from_opf_manifest_item(self, item, dir):
+        ''' Append the ManifestItem built from item, using the id attribute of item or a generated id '''
+        self.append(ManifestItem.from_opf_manifest_item(item, dir))
+        # A missing or empty id attribute is replaced by one generated from next_id
+        item_id = item.get('id', '') or 'id%d' % self.next_id
+        self[-1].id = item_id
+        self.next_id += 1
+
     @staticmethod
     def from_opf_manifest_element(items, dir):
         m = Manifest()
         for item in items:
             try:
-                m.append(ManifestItem.from_opf_manifest_item(item, dir))
-                id = item.get('id', '')
-                if not id:
-                    id = 'id%d'%m.next_id
-                m[-1].id = id
-                m.next_id += 1
+                m.append_from_opf_manifest_item(item, dir)  # appends the item and assigns its id
             except ValueError:
                 continue
         return m
@@ -660,7 +663,6 @@ class OPF(object):  # {{{
                 for item in self.manifest:
                     if 'toc' in item.href().lower():
                         toc = item.path
-
             if toc is None:
                 return
             self.toc = TOC(base_path=self.base_dir)
@@ -721,18 +723,17 @@ class OPF(object):  # {{{
         return [i.get('id') for i in items]
 
     def add_path_to_manifest(self, path, media_type):
-        has_path = False
+        ''' Ensure path is in the manifest (adding it with media_type if it is not) and return its manifest id '''
         path = os.path.abspath(path)
         for i in self.itermanifest():
             xpath = os.path.join(self.base_dir, *(i.get('href', '').split('/')))
             if os.path.abspath(xpath) == path:
-                has_path = True
-                break
-        if not has_path:
-            href = os.path.relpath(path, self.base_dir).replace(os.sep, '/')
-            item = self.create_manifest_item(href, media_type)
-            manifest = self.manifest_ppath(self.root)[0]
-            manifest.append(item)
+                return i.get('id')  # already listed, reuse the existing id
+        href = os.path.relpath(path, self.base_dir).replace(os.sep, '/')
+        item = self.create_manifest_item(href, media_type)
+        self.manifest_ppath(self.root)[0].append(item)
+        self.manifest.append_from_opf_manifest_item(item, self.base_dir)  # also register it in self.manifest
+        return item.get('id')
 
     def iterspine(self):
         return self.spine_path(self.root)
@@ -1184,6 +1185,20 @@ class OPF(object):  # {{{
                     if mt and 'xml' not in mt and 'html' not in mt:
                         return item.get('href', None)
 
+    @property
+    def epub3_nav(self):
+        ''' Path of the first usable EPUB 3 nav document in the manifest, None if there is none '''
+        if self.package_version >= 3.0:
+            for entry in self.itermanifest():
+                is_nav = 'nav' in (entry.get('properties') or '').lower().split()
+                is_html = 'html' in (entry.get('media-type') or '').lower()
+                entry_id = entry.get('id')
+                if is_nav and is_html and entry_id:
+                    # Only return a nav file that is actually present on disk
+                    nav_path = self.manifest.path_for_id(entry_id)
+                    if nav_path and os.path.exists(nav_path):
+                        return nav_path
+
     @dynamic_property
     def cover(self):