From 921a579d35d892cf19dfef112bcb0cbe4f015a40 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Jun 2016 23:50:58 +0530 Subject: [PATCH] EPUB Input: Implement reading of Table of Contents from EPUB 3 files that do not specify a fallback NCX ToC --- .../ebooks/conversion/plugins/epub_input.py | 60 ++++++++++++++++++- src/calibre/ebooks/metadata/opf2.py | 45 +++++++++----- 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/epub_input.py b/src/calibre/ebooks/conversion/plugins/epub_input.py index 8aa1bcf4ae..80ebc57f14 100644 --- a/src/calibre/ebooks/conversion/plugins/epub_input.py +++ b/src/calibre/ebooks/conversion/plugins/epub_input.py @@ -218,6 +218,10 @@ class EPUBInput(InputFormatPlugin): raise DRMError(os.path.basename(path)) self.encrypted_fonts = self._encrypted_font_uris + epub3_nav = opf.epub3_nav + if epub3_nav is not None: + self.convert_epub3_nav(epub3_nav, opf, log) + if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1])+'/' for elem in opf.itermanifest(): @@ -252,11 +256,65 @@ class EPUBInput(InputFormatPlugin): if len(list(opf.iterspine())) == 0: raise ValueError('No valid entries in the spine of this EPUB') - with open('content.opf', 'wb') as nopf: + with lopen('content.opf', 'wb') as nopf: nopf.write(opf.render()) return os.path.abspath(u'content.opf') + def convert_epub3_nav(self, nav_path, opf, log): + from lxml import etree + from calibre.ebooks.chardet import xml_to_unicode + from calibre.ebooks.oeb.polish.parsing import parse + from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX + from calibre.ebooks.oeb.polish.toc import first_child + from tempfile import NamedTemporaryFile + with lopen(nav_path, 'rb') as f: + raw = f.read() + raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0] + root = parse(raw, log=log) + ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>') + navmap = ncx[0] + et = '{%s}type' % EPUB_NS + bn = os.path.basename(nav_path) + + def add_from_li(li, 
parent): + href = text = None + for x in li.iterchildren(XHTML('a'), XHTML('span')): + text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip() + href = x.get('href') + if href: + if href.startswith('#'): + href = bn + href + break + np = parent.makeelement(NCX('navPoint')) + parent.append(np) + np.append(np.makeelement(NCX('navLabel'))) + np[0].append(np.makeelement(NCX('text'))) + np[0][0].text = text + if href: + np.append(np.makeelement(NCX('content'), attrib={'src':href})) + return np + + def process_nav_node(node, toc_parent): + for li in node.iterchildren(XHTML('li')): + child = add_from_li(li, toc_parent) + ol = first_child(li, XHTML('ol')) + if child is not None and ol is not None: + process_nav_node(ol, child) + + for nav in root.iterdescendants(XHTML('nav')): + if nav.get(et) == 'toc': + ol = first_child(nav, XHTML('ol')) + if ol is not None: + process_nav_node(ol, navmap) + break + + with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f: + f.write(etree.tostring(ncx, encoding='utf-8')) + ncx_id = opf.add_path_to_manifest(f.name, NCX_MIME) + for spine in opf.root.xpath('//*[local-name()="spine"]'): + spine.set('toc', ncx_id) + def postprocess_book(self, oeb, opts, log): rc = getattr(self, 'removed_cover', None) if rc: diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 262d59aa82..bfe1095aad 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -219,17 +219,20 @@ class ManifestItem(Resource): # {{{ class Manifest(ResourceCollection): # {{{ + def append_from_opf_manifest_item(self, item, dir): + self.append(ManifestItem.from_opf_manifest_item(item, dir)) + id = item.get('id', '') + if not id: + id = 'id%d'%self.next_id + self[-1].id = id + self.next_id += 1 + @staticmethod def from_opf_manifest_element(items, dir): m = Manifest() for item in items: try: - 
m.append(ManifestItem.from_opf_manifest_item(item, dir)) - id = item.get('id', '') - if not id: - id = 'id%d'%m.next_id - m[-1].id = id - m.next_id += 1 + m.append_from_opf_manifest_item(item, dir) except ValueError: continue return m @@ -660,7 +663,6 @@ class OPF(object): # {{{ for item in self.manifest: if 'toc' in item.href().lower(): toc = item.path - if toc is None: return self.toc = TOC(base_path=self.base_dir) @@ -721,18 +723,17 @@ class OPF(object): # {{{ return [i.get('id') for i in items] def add_path_to_manifest(self, path, media_type): - has_path = False path = os.path.abspath(path) for i in self.itermanifest(): xpath = os.path.join(self.base_dir, *(i.get('href', '').split('/'))) if os.path.abspath(xpath) == path: - has_path = True - break - if not has_path: - href = os.path.relpath(path, self.base_dir).replace(os.sep, '/') - item = self.create_manifest_item(href, media_type) - manifest = self.manifest_ppath(self.root)[0] - manifest.append(item) + return i.get('id') + href = os.path.relpath(path, self.base_dir).replace(os.sep, '/') + item = self.create_manifest_item(href, media_type) + manifest = self.manifest_ppath(self.root)[0] + manifest.append(item) + self.manifest.append_from_opf_manifest_item(item, self.basedir) + return item.get('id') def iterspine(self): return self.spine_path(self.root) @@ -1184,6 +1185,20 @@ class OPF(object): # {{{ if mt and 'xml' not in mt and 'html' not in mt: return item.get('href', None) + @property + def epub3_nav(self): + if self.package_version >= 3.0: + for item in self.itermanifest(): + props = (item.get('properties') or '').lower().split() + if 'nav' in props: + mt = item.get('media-type') or '' + if 'html' in mt.lower(): + mid = item.get('id') + if mid: + path = self.manifest.path_for_id(mid) + if path and os.path.exists(path): + return path + @dynamic_property def cover(self):