Make parsing of HTML ToC in KF8 more robust

This commit is contained in:
Kovid Goyal 2012-04-01 10:43:12 +05:30
parent 3b72f160e0
commit fa196cf1af

View File

@@ -446,6 +446,7 @@ class Mobi8Reader(object):
current_depth = None current_depth = None
parent = ans parent = ans
seen = set() seen = set()
links = []
for elem in root.iterdescendants(etree.Element): for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == XHTML('a') and elem.get('href', if reached and elem.tag == XHTML('a') and elem.get('href',
False): False):
@@ -453,24 +454,32 @@ class Mobi8Reader(object):
href, frag = urldefrag(href) href, frag = urldefrag(href)
href = base_href + '/' + href href = base_href + '/' + href
text = xml2text(elem).strip() text = xml2text(elem).strip()
if text in seen: if (text, href, frag) in seen:
continue continue
seen.add(text) seen.add((text, href, frag))
depth = node_depth(elem) links.append((text, href, frag, node_depth(elem)))
elif elem is start:
reached = True
depths = sorted(set(x[-1] for x in links))
depth_map = {x:i for i, x in enumerate(depths)}
for text, href, frag, depth in links:
depth = depth_map[depth]
if current_depth is None: if current_depth is None:
current_depth = depth current_depth = 0
if current_depth == depth: parent.add_item(href, frag, text)
elif current_depth == depth:
parent.add_item(href, frag, text) parent.add_item(href, frag, text)
elif current_depth < depth: elif current_depth < depth:
parent = parent[-1] parent = parent[-1] if len(parent) > 0 else parent
parent.add_item(href, frag, text) parent.add_item(href, frag, text)
current_depth = depth current_depth += 1
else: else:
delta = current_depth - depth
while delta > 0 and parent.parent is not None:
parent = parent.parent parent = parent.parent
delta -= 1
parent.add_item(href, frag, text) parent.add_item(href, frag, text)
current_depth = depth current_depth = depth
else:
if elem is start:
reached = True
return ans return ans