ToC wizard: Fix generating toc from headings/xpath yielding unexpected results when tags are present out of sequence

2025-07-09 03:04:10 -04:00 · 2016-08-25 19:56:52 +05:30 · 2016-08-25 19:56:52 +05:30 · 3347f5b011
commit 3347f5b011
parent 7152a09ee2
2 changed files with 46 additions and 13 deletions
--- a/src/calibre/ebooks/oeb/polish/tests/structure.py
+++ b/src/calibre/ebooks/oeb/polish/tests/structure.py
@ -15,7 +15,7 @@ from calibre.ebooks.oeb.polish.create import create_book
 from calibre.ebooks.oeb.polish.cover import (
    find_cover_image, mark_as_cover, find_cover_page, mark_as_titlepage, clean_opf
 )
-from calibre.ebooks.oeb.polish.toc import get_toc
+from calibre.ebooks.oeb.polish.toc import get_toc, from_xpaths as toc_from_xpaths
 from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.ebooks.oeb.base import OEB_DOCS
 from calibre.ebooks.metadata.book.base import Metadata
@ -95,6 +95,29 @@ class Structure(BaseTest):
        self.assertTrue(len(toc))
        self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')
        def tfx(linear, expected):
            # Write a flat nav.html holding one <tN>N</tN> element per character
            # of ``linear``, build a ToC from one xpath per distinct N (sorted),
            # and check that its bracketed rendering equals ``expected``.
            markup = '\n'.join('<t{0}>{0}</t{0}>'.format(x) for x in linear)
            head = '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
            html = head + ('<body>%s</body></html>' % markup)
            with c.open('nav.html', 'wb') as f:
                f.write(html.encode('utf-8'))
            xpaths = ['//h:t' + level for level in sorted(set(linear))]
            toc = toc_from_xpaths(c, xpaths)

            def p(node):
                # Render the children of ``node`` as titles, each followed by
                # its own bracketed subtree; a childless node renders as ''.
                if not node.children:
                    return ''
                inner = ''.join(child.title + p(child) for child in node.children)
                return '[' + inner + ']'

            self.assertEqual('[%s]' % expected, p(toc))
        tfx('121333', '1[2]1[333]')
        tfx('1223424', '1[22[3[4]]2[4]]')
        tfx('32123', '321[2[3]]')
        tfx('123123', '1[2[3]]1[2[3]]')
    def test_epub3_covers(self):
        # cover image
        ce = partial(self.create_epub, ver=3)
--- a/src/calibre/ebooks/oeb/polish/toc.py
+++ b/src/calibre/ebooks/oeb/polish/toc.py
@ -85,6 +85,10 @@ class TOC(object):
        except ValueError:
            return 1
    @property
    def last_child(self):
        '''Return the final entry of self.children, or None when there are no children.'''
        if not self.children:
            return None
        return self.children[-1]
    def get_lines(self, lvl=0):
        frag = ('#'+self.frag) if self.frag else ''
        ans = [(u'\t'*lvl) + u'TOC: %s --> %s%s'%(self.title, self.dest, frag)]
@ -315,8 +319,6 @@ def from_xpaths(container, xpaths):
    '''
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]
    level_prev = {i+1:None for i in xrange(len(xpaths))}
    level_prev[0] = tocroot
    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
@ -336,31 +338,39 @@ def from_xpaths(container, xpaths):
            lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap
    node_level_map = {tocroot: 0}
    def parent_for_level(child_level):
        # Pick the node that a new entry of level ``child_level`` should be
        # added to, by walking down the chain of last children from tocroot.
        wanted = child_level - 1
        node = tocroot
        while True:
            tail = node.last_child
            if tail is None:
                # Nothing below this node yet, so it becomes the parent.
                return node
            tail_level = node_level_map[tail]
            if tail_level > wanted:
                # The last child is deeper than the wanted parent level:
                # attach alongside it, under the current node.
                return node
            if tail_level == wanted:
                return tail
            # The last child is shallower than the wanted level: keep descending.
            node = tail
    for name, level_item_map in maps.iteritems():
        root = container.parsed(name)
        item_level_map = {e:i for i, elems in level_item_map.iteritems() for e in elems}
        item_dirtied = False
        for item in root.iterdescendants(etree.Element):
-            lvl = plvl = item_level_map.get(item, None)
+            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
-            parent = None
+            text = elem_to_toc_text(item)
-            while parent is None:
+            parent = parent_for_level(lvl)
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            if item_at_top(item):
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True
            level_prev[lvl] = toc
            for i in xrange(lvl+1, len(xpaths)+1):
                level_prev[i] = None
        if item_dirtied:
            container.commit_item(name, keep_parsed=True)