From 0d9047cfa3fd53d60aff325e6f5fd3dc404a42b9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 29 Jun 2019 12:16:42 +0530 Subject: [PATCH] DOCX Input: When autodetecting the Table of Contents from headings, work even if the headings use numbering. Fixes #1834661 [Private bug](https://bugs.launchpad.net/calibre/+bug/1834661) --- src/calibre/ebooks/docx/to_html.py | 1 + src/calibre/ebooks/docx/toc.py | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 42b55020e1..c56c78f8c9 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -497,6 +497,7 @@ class Convert(object): if m is not None: n = min(6, max(1, int(m.group(1)))) dest.tag = 'h%d' % n + dest.set('data-heading-level', unicode_type(n)) if style.bidi is True: dest.set('dir', 'rtl') diff --git a/src/calibre/ebooks/docx/toc.py b/src/calibre/ebooks/docx/toc.py index a287514e2c..3c607012b1 100644 --- a/src/calibre/ebooks/docx/toc.py +++ b/src/calibre/ebooks/docx/toc.py @@ -15,15 +15,15 @@ from calibre.ebooks.oeb.polish.toc import elem_to_toc_text from polyglot.builtins import iteritems, range -def from_headings(body, log, namespace): +def from_headings(body, log, namespace, num_levels=3): ' Create a TOC from headings in the document ' - XPath, descendants = namespace.XPath, namespace.descendants - headings = ('h1', 'h2', 'h3') tocroot = TOC() - xpaths = [XPath('//%s' % x) for x in headings] - level_prev = {i+1:None for i in range(len(xpaths))} + all_heading_nodes = body.xpath('//*[@data-heading-level]') + level_prev = {i+1:None for i in range(num_levels)} level_prev[0] = tocroot - level_item_map = {i+1:frozenset(xp(body)) for i, xp in enumerate(xpaths)} + level_item_map = {i:frozenset( + x for x in all_heading_nodes if int(x.get('data-heading-level')) == i) + for i in range(1, num_levels+1)} item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems} idcount = count() @@ -35,7 +35,7 @@ def from_headings(body, log, namespace): elem.set('id', ans) return ans - for item in descendants(body, *headings): + for item in all_heading_nodes: lvl = plvl = item_level_map.get(item, None) if lvl is None: continue @@ -48,7 +48,7 @@ def from_headings(body, log, namespace): text = elem_to_toc_text(item) toc = parent.add_item('index.html', elem_id, text) level_prev[lvl] = toc - for i in range(lvl+1, len(xpaths)+1): + for i in range(lvl+1, num_levels+1): level_prev[i] = None if len(tuple(tocroot.flat())) > 1: @@ -136,4 +136,8 @@ def from_toc(docx, link_map, styles, object_map, log, namespace): def create_toc(docx, body, link_map, styles, object_map, log, namespace): - return from_toc(docx, link_map, styles, object_map, log, namespace) or from_headings(body, log, namespace) + ans = from_toc(docx, link_map, styles, object_map, log, namespace) or from_headings(body, log, namespace) + # Remove heading level attributes + for h in body.xpath('//*[@data-heading-level]'): + del h.attrib['data-heading-level'] + return ans