DOCX Input: Support for Word-created ToC

DOCX Input: Support for Table of Contents created using the Word Table of Contents tool. calibre now first looks for such a Table of Contents and only if one is not found does it generate a ToC from headings.
2025-07-09 03:04:10 -04:00 · 2013-06-15 18:44:58 +05:30 · 2013-06-15 18:44:58 +05:30 · 8c261063b4
commit 8c261063b4
parent e444f27de8
2 changed files with 87 additions and 5 deletions
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -171,7 +171,7 @@ class Convert(object):
        self.log.debug('Cleaning up redundant markup generated by Word')
        cleanup_markup(self.html, self.styles)
-        return self.write()
+        return self.write(doc)
    def read_page_properties(self, doc):
        current = []
@ -266,8 +266,8 @@ class Convert(object):
        self.styles.resolve_numbering(numbering)
-    def write(self):
+    def write(self, doc):
-        toc = create_toc(self.body)
+        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
@ -367,11 +367,13 @@ class Convert(object):
        return wrapper
    def resolve_links(self, relationships_by_id):
        self.resolved_link_map = {}
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
--- a/src/calibre/ebooks/docx/toc.py
+++ b/src/calibre/ebooks/docx/toc.py
@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
-from calibre.ebooks.docx.names import XPath, descendants
+from collections import namedtuple
 from lxml.etree import tostring
 from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -17,7 +21,7 @@ class Count(object):
    def __init__(self):
        self.val = 0
-def create_toc(body):
+def from_headings(body):
    ' Create a TOC from headings in the document '
    headings = ('h1', 'h2', 'h3')
    tocroot = TOC()
@ -56,5 +60,81 @@ def create_toc(body):
    if len(tuple(tocroot.flat())) > 1:
        return tocroot
 def structure_toc(entries):
    '''
    Arrange a flat sequence of ToC entries into a tree based on indentation.

    Every entry must expose ``text``, ``anchor`` and ``indent`` attributes.
    The distinct indent values, sorted ascending, define the nesting levels:
    an entry becomes a child of the most recently added entry that has a
    smaller indent and has not been superseded since. All items point at
    'index.html'. Returns the root TOC object.
    '''
    levels = sorted({entry.indent for entry in entries})
    root = TOC()

    if len(levels) > 6:
        # Too many distinct indents to treat as nesting levels: emit a
        # flat list instead.
        for entry in entries:
            root.add_item('index.html', entry.anchor, entry.text)
        return root

    # most_recent[i] holds the node last added at level i, or None when no
    # such node is currently usable as a parent.
    most_recent = [None] * len(levels)

    for entry in entries:
        depth = levels.index(entry.indent)

        # Pick the nearest shallower level that still has a node; fall back
        # to the root when there is none.
        parent = root
        for candidate in reversed(most_recent[:depth]):
            if candidate is not None:
                parent = candidate
                break

        most_recent[depth] = parent.add_item(
            'index.html', entry.anchor, entry.text)

        # Nodes at deeper levels belong to an earlier branch and must not
        # adopt entries that follow this one.
        most_recent[depth + 1:] = [None] * (len(most_recent) - depth - 1)

    return root
 def link_to_txt(a, styles, object_map):
    '''
    Return the text content of the link element ``a``, with surrounding
    whitespace stripped.

    When ``a`` has more than one child, every child that ``object_map`` maps
    to a run whose resolved style has ``display: none`` is removed from ``a``
    first, so hidden text does not end up in the result. Note that this
    modifies ``a`` in place.

    :param a: The link element to extract text from
    :param styles: Object whose ``resolve(run)`` returns a style with a
                   ``css`` mapping
    :param object_map: Mapping from child elements of ``a`` to their runs
    '''
    if len(a) > 1:
        # Iterate over a snapshot of the children: removing from ``a`` while
        # iterating over ``a`` itself risks skipping siblings.
        for child in tuple(a):
            run = object_map.get(child, None)
            if run is None:
                continue
            rs = styles.resolve(run)
            if rs.css.get('display', None) == 'none':
                a.remove(child)
    return tostring(a, method='text', with_tail=False, encoding=unicode).strip()
 def from_toc(docx, link_map, styles, object_map):
    '''
    Create a TOC from a "TOC" field in the document, if there is one.

    Walks the field characters, field instructions and hyperlinks of
    ``docx`` in document order, tracking field nesting depth. Once an
    instruction starting with ``TOC `` is seen, every hyperlink inside that
    field that is present in ``link_map`` becomes an entry; the left margin
    of the enclosing paragraph is used as the indent that determines
    nesting. Scanning stops when the TOC field ends.

    :param docx: The parsed document to search
    :param link_map: Mapping from w:hyperlink elements to the link elements
                     generated for them
    :param styles: Object used to resolve run and paragraph styles
    :param object_map: Passed through to link_to_txt()
    :return: The result of structure_toc() when at least one entry was
             found, otherwise None
    '''
    toc_level = None  # Field nesting depth at which the TOC field was found
    level = 0  # Current field nesting depth
    TI = namedtuple('TI', 'text anchor indent')
    toc = []
    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
        n = tag.tag.rpartition('}')[-1]
        if n == 'fldChar':
            t = get(tag, 'w:fldCharType')
            if t == 'begin':
                level += 1
            elif t == 'end':
                level -= 1
                if toc_level is not None and level < toc_level:
                    # The TOC field itself has been closed
                    break
        elif n == 'instrText':
            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
                toc_level = level
        elif n == 'hyperlink':
            if toc_level is not None and level >= toc_level and tag in link_map:
                a = link_map[tag]
                href = a.get('href', None)
                txt = link_to_txt(a, styles, object_map)
                p = ancestor(tag, 'w:p')
                # The anchor is the href minus its leading '#'. An href
                # without one is not a link into this document and would
                # produce a bogus anchor, so skip it.
                if txt and href and href.startswith('#') and p is not None:
                    ps = styles.resolve_paragraph(p)
                    try:
                        # Drop the two character unit suffix. Parse via
                        # float so that fractional margins do not raise
                        # ValueError and silently collapse to zero.
                        # NOTE(review): assumes margin_left looks like
                        # '<number>pt' -- confirm against the styles code
                        ml = int(float(ps.margin_left[:-2]))
                    except (TypeError, ValueError, AttributeError, OverflowError):
                        ml = 0
                    if ps.text_align in {'center', 'right'}:
                        # The left margin says nothing about nesting for
                        # centered or right aligned entries
                        ml = 0
                    toc.append(TI(txt, href[1:], ml))
    if toc:
        return structure_toc(toc)
 def create_toc(docx, body, link_map, styles, object_map):
    '''
    Create the Table of Contents for the converted document.

    A ToC built by from_toc() is used when that yields a non-empty result,
    otherwise the result of from_headings(body) is returned.
    '''
    toc = from_toc(docx, link_map, styles, object_map)
    if toc:
        return toc
    return from_headings(body)