From 648637439f1a03ed854885d34879c325d31ef2d3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 31 Mar 2012 18:24:53 +0530 Subject: [PATCH] KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead. Fixes #969238 (Private bug) --- src/calibre/ebooks/mobi/reader/mobi8.py | 76 +++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index d2254e00d8..9d4acb6818 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -10,13 +10,19 @@ __docformat__ = 'restructuredtext en' import struct, re, os, imghdr from collections import namedtuple from itertools import repeat +from urlparse import urldefrag + +from lxml import etree from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import read_index from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup from calibre.ebooks.metadata.opf2 import Guide, OPFCreator +from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.mobi.utils import read_font_record +from calibre.ebooks.oeb.parse_utils import parse_html +from calibre.ebooks.oeb.base import XPath, XHTML, xml2text Part = namedtuple('Part', 'num type filename start end aid') @@ -383,6 +389,19 @@ class Mobi8Reader(object): len(resource_map)): mi.cover = resource_map[self.cover_offset] + if len(list(toc)) < 2: + self.log.warn('KF8 has no metadata Table of Contents') + + for ref in guide: + if ref.type == 'toc': + href = ref.href() + href, frag = urldefrag(href) + if os.path.exists(href.replace('/', os.sep)): + try: + toc = self.read_inline_toc(href, frag) + except: + self.log.exception('Failed to read inline ToC') + opf = OPFCreator(os.getcwdu(), mi) opf.guide = guide @@ -397,4 +416,61 @@ class Mobi8Reader(object): opf.render(of, ncx, 'toc.ncx') return 'metadata.opf' + def 
def read_inline_toc(self, href, frag):
    '''
    Build a Table of Contents from the links found in an HTML file.

    Every ``<a href="...">`` element that comes after the start element
    (in document order) becomes a ToC entry. Entries are nested according
    to how deeply the corresponding ``<a>`` element is nested in the HTML
    tree. Links whose text has already been seen are ignored.

    :param href: Path of the HTML file holding the inline ToC, using
        ``/`` as separator. The file is opened as given, after converting
        the separators to ``os.sep``.
    :param frag: ``id`` of the element at which the inline ToC starts. If
        empty, or if no element has that id, scanning starts at
        ``<body>``. The start element itself is never turned into an
        entry.
    :return: A :class:`TOC` object containing the extracted entries (it is
        empty if no usable links are found).

    Errors from opening, decoding or parsing the file are not caught here.
    '''
    toc = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)

    body = XPath('//h:body')(root)
    if body:
        start, reached = body[0], False
    else:
        # No <body> to wait for, so accept links from the very beginning
        start, reached = None, True
    if frag:
        # The compiled expression has to be evaluated against the tree to
        # obtain the list of matching elements
        elems = XPath('//*[@id="%s"]'%frag)(root)
        if elems:
            start = elems[0]

    def node_depth(elem):
        # Number of ancestors of elem
        depth = 0
        ancestor = elem.getparent()
        while ancestor is not None:
            ancestor = ancestor.getparent()
            depth += 1
        return depth

    # Layer the ToC based on nesting order in the source HTML.
    # levels is a stack of (html_depth, toc_node) pairs: links found at
    # html_depth are added as children of toc_node. A pair is only pushed
    # immediately before an entry is added to its toc_node, so every
    # toc_node on the stack has at least one child.
    levels = []
    seen = set()
    anchor_tag = XHTML('a')
    for elem in root.iterdescendants(etree.Element):
        if not reached:
            if elem is start:
                reached = True
            continue
        if elem.tag != anchor_tag or not elem.get('href', False):
            continue

        link_href, link_frag = urldefrag(elem.get('href'))
        link_href = base_href + '/' + link_href
        text = xml2text(elem).strip()
        if text in seen:
            continue
        seen.add(text)

        depth = node_depth(elem)
        # Climb back up past every level that is deeper than this link.
        # This may empty the stack, it can never climb above the root.
        while levels and levels[-1][0] > depth:
            levels.pop()
        if not levels:
            # First link, or a link shallower than everything seen so far
            levels.append((depth, toc))
        elif levels[-1][0] < depth:
            # Deeper than the current level: nest under the last entry
            levels.append((depth, levels[-1][1][-1]))
        levels[-1][1].add_item(link_href, link_frag, text)
    return toc