mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KF8 Input: When the KF8 book has no metadata ToC, try to extract the ToC from the HTML instead. Fixes #969238 (Private bug)
This commit is contained in:
parent
6c563866c6
commit
648637439f
@ -10,13 +10,19 @@ __docformat__ = 'restructuredtext en'
|
||||
import struct, re, os, imghdr
|
||||
from collections import namedtuple
|
||||
from itertools import repeat
|
||||
from urlparse import urldefrag
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
|
||||
from calibre.ebooks.mobi.reader.index import read_index
|
||||
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
|
||||
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
|
||||
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.mobi.utils import read_font_record
|
||||
from calibre.ebooks.oeb.parse_utils import parse_html
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
|
||||
|
||||
Part = namedtuple('Part',
|
||||
'num type filename start end aid')
|
||||
@ -383,6 +389,19 @@ class Mobi8Reader(object):
|
||||
len(resource_map)):
|
||||
mi.cover = resource_map[self.cover_offset]
|
||||
|
||||
if len(list(toc)) < 2:
|
||||
self.log.warn('KF8 has no metadata Table of Contents')
|
||||
|
||||
for ref in guide:
|
||||
if ref.type == 'toc':
|
||||
href = ref.href()
|
||||
href, frag = urldefrag(href)
|
||||
if os.path.exists(href.replace('/', os.sep)):
|
||||
try:
|
||||
toc = self.read_inline_toc(href, frag)
|
||||
except:
|
||||
self.log.exception('Failed to read inline ToC')
|
||||
|
||||
opf = OPFCreator(os.getcwdu(), mi)
|
||||
opf.guide = guide
|
||||
|
||||
@ -397,4 +416,61 @@ class Mobi8Reader(object):
|
||||
opf.render(of, ncx, 'toc.ncx')
|
||||
return 'metadata.opf'
|
||||
|
||||
def read_inline_toc(self, href, frag):
|
||||
ans = TOC()
|
||||
base_href = '/'.join(href.split('/')[:-1])
|
||||
with open(href.replace('/', os.sep), 'rb') as f:
|
||||
raw = f.read().decode(self.header.codec)
|
||||
root = parse_html(raw, log=self.log)
|
||||
body = XPath('//h:body')(root)
|
||||
reached = False
|
||||
if body:
|
||||
start = body[0]
|
||||
else:
|
||||
start = None
|
||||
reached = True
|
||||
if frag:
|
||||
elems = XPath('//*[@id="%s"]'%frag)
|
||||
if elems:
|
||||
start = elems[0]
|
||||
|
||||
def node_depth(elem):
|
||||
ans = 0
|
||||
parent = elem.getparent()
|
||||
while parent is not None:
|
||||
parent = parent.getparent()
|
||||
ans += 1
|
||||
return ans
|
||||
|
||||
# Layer the ToC based on nesting order in the source HTML
|
||||
current_depth = None
|
||||
parent = ans
|
||||
seen = set()
|
||||
for elem in root.iterdescendants(etree.Element):
|
||||
if reached and elem.tag == XHTML('a') and elem.get('href',
|
||||
False):
|
||||
href = elem.get('href')
|
||||
href, frag = urldefrag(href)
|
||||
href = base_href + '/' + href
|
||||
text = xml2text(elem).strip()
|
||||
if text in seen:
|
||||
continue
|
||||
seen.add(text)
|
||||
depth = node_depth(elem)
|
||||
if current_depth is None:
|
||||
current_depth = depth
|
||||
if current_depth == depth:
|
||||
parent.add_item(href, frag, text)
|
||||
elif current_depth < depth:
|
||||
parent = parent[-1]
|
||||
parent.add_item(href, frag, text)
|
||||
current_depth = depth
|
||||
else:
|
||||
parent = parent.parent
|
||||
parent.add_item(href, frag, text)
|
||||
current_depth = depth
|
||||
else:
|
||||
if elem is start:
|
||||
reached = True
|
||||
return ans
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user