DOCX Input: Support for Word created ToC

DOCX Input: Support for Table of Contents created using the Word Table of Contents tool. calibre now first looks for such a Table of Contents and only if one is not found does it generate a ToC from headings.
2025-07-09 03:04:10 -04:00 · 2013-06-15 18:44:58 +05:30 · 2013-06-15 18:44:58 +05:30 · 8c261063b4
commit 8c261063b4
parent e444f27de8
2 changed files with 87 additions and 5 deletions
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -171,7 +171,7 @@ class Convert(object):
        self.log.debug('Cleaning up redundant markup generated by Word')
        cleanup_markup(self.html, self.styles)

-        return self.write()
+        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
@ -266,8 +266,8 @@ class Convert(object):

        self.styles.resolve_numbering(numbering)

-    def write(self):
-        toc = create_toc(self.body)
+    def write(self, doc):
+        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
@ -367,11 +367,13 @@ class Convert(object):
        return wrapper

    def resolve_links(self, relationships_by_id):
+        self.resolved_link_map = {}
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
+            self.resolved_link_map[hyperlink] = span
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
--- a/src/calibre/ebooks/docx/toc.py
+++ b/src/calibre/ebooks/docx/toc.py
@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

-from calibre.ebooks.docx.names import XPath, descendants
+from collections import namedtuple
+
+from lxml.etree import tostring
+
+from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.oeb.polish.toc import elem_to_toc_text

@ -17,7 +21,7 @@ class Count(object):
    def __init__(self):
        self.val = 0

-def create_toc(body):
+def from_headings(body):
    ' Create a TOC from headings in the document '
    headings = ('h1', 'h2', 'h3')
    tocroot = TOC()
@ -56,5 +60,81 @@ def create_toc(body):
    if len(tuple(tocroot.flat())) > 1:
        return tocroot

+def structure_toc(entries):
+    indent_vals = sorted({x.indent for x in entries})
+    last_found = [None for i in indent_vals]
+    newtoc = TOC()
+
+    if len(indent_vals) > 6:
+        for x in entries:
+            newtoc.add_item('index.html', x.anchor, x.text)
+        return newtoc
+
+    def find_parent(level):
+        candidates = last_found[:level]
+        for x in reversed(candidates):
+            if x is not None:
+                return x
+        return newtoc
+
+    for item in entries:
+        level = indent_vals.index(item.indent)
+        parent = find_parent(level)
+        last_found[level] = parent.add_item('index.html', item.anchor,
+                    item.text)
+        for i in xrange(level+1, len(last_found)):
+            last_found[i] = None
+
+    return newtoc
+
+def link_to_txt(a, styles, object_map):
+    if len(a) > 1:
+        for child in a:
+            run = object_map.get(child, None)
+            if run is not None:
+                rs = styles.resolve(run)
+                if rs.css.get('display', None) == 'none':
+                    a.remove(child)
+
+    return tostring(a, method='text', with_tail=False, encoding=unicode).strip()
+
+def from_toc(docx, link_map, styles, object_map):
+    toc_level = None
+    level = 0
+    TI = namedtuple('TI', 'text anchor indent')
+    toc = []
+    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
+        n = tag.tag.rpartition('}')[-1]
+        if n == 'fldChar':
+            t = get(tag, 'w:fldCharType')
+            if t == 'begin':
+                level += 1
+            elif t == 'end':
+                level -= 1
+                if toc_level is not None and level < toc_level:
+                    break
+        elif n == 'instrText':
+            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
+                toc_level = level
+        elif n == 'hyperlink':
+            if toc_level is not None and level >= toc_level and tag in link_map:
+                a = link_map[tag]
+                href = a.get('href', None)
+                txt = link_to_txt(a, styles, object_map)
+                p = ancestor(tag, 'w:p')
+                if txt and href and p is not None:
+                    ps = styles.resolve_paragraph(p)
+                    try:
+                        ml = int(ps.margin_left[:-2])
+                    except (TypeError, ValueError, AttributeError):
+                        ml = 0
+                    if ps.text_align in {'center', 'right'}:
+                        ml = 0
+                    toc.append(TI(txt, href[1:], ml))
+    if toc:
+        return structure_toc(toc)
+
+def create_toc(docx, body, link_map, styles, object_map):
+    return from_toc(docx, link_map, styles, object_map) or from_headings(body)