Make parsing of HTML ToC in KF8 more robust

This commit is contained in:
Kovid Goyal 2012-04-01 10:43:12 +05:30
parent 3b72f160e0
commit fa196cf1af

View File

@@ -446,6 +446,7 @@ class Mobi8Reader(object):
current_depth = None current_depth = None
parent = ans parent = ans
seen = set() seen = set()
links = []
for elem in root.iterdescendants(etree.Element): for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == XHTML('a') and elem.get('href', if reached and elem.tag == XHTML('a') and elem.get('href',
False): False):
@@ -453,24 +454,32 @@ class Mobi8Reader(object):
href, frag = urldefrag(href) href, frag = urldefrag(href)
href = base_href + '/' + href href = base_href + '/' + href
text = xml2text(elem).strip() text = xml2text(elem).strip()
if text in seen: if (text, href, frag) in seen:
continue continue
seen.add(text) seen.add((text, href, frag))
depth = node_depth(elem) links.append((text, href, frag, node_depth(elem)))
if current_depth is None: elif elem is start:
current_depth = depth reached = True
if current_depth == depth:
parent.add_item(href, frag, text) depths = sorted(set(x[-1] for x in links))
elif current_depth < depth: depth_map = {x:i for i, x in enumerate(depths)}
parent = parent[-1] for text, href, frag, depth in links:
parent.add_item(href, frag, text) depth = depth_map[depth]
current_depth = depth if current_depth is None:
else: current_depth = 0
parent = parent.parent parent.add_item(href, frag, text)
parent.add_item(href, frag, text) elif current_depth == depth:
current_depth = depth parent.add_item(href, frag, text)
elif current_depth < depth:
parent = parent[-1] if len(parent) > 0 else parent
parent.add_item(href, frag, text)
current_depth += 1
else: else:
if elem is start: delta = current_depth - depth
reached = True while delta > 0 and parent.parent is not None:
parent = parent.parent
delta -= 1
parent.add_item(href, frag, text)
current_depth = depth
return ans return ans