From 8c261063b484128bc2d4333d9fa7eef1a97a84e2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 15 Jun 2013 18:44:58 +0530
Subject: [PATCH] DOCX Input: Support for Word created ToC

DOCX Input: Support for Table of Contents created using the Word Table
of Contents tool. calibre now first looks for such a Table of Contents
and only if one is not found does it generate a ToC from headings.
---
 src/calibre/ebooks/docx/to_html.py |  8 +--
 src/calibre/ebooks/docx/toc.py     | 84 +++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 5 deletions(-)
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 6fd0026874..c3b2391d3f 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -171,7 +171,7 @@ class Convert(object):
         self.log.debug('Cleaning up redundant markup generated by Word')
         cleanup_markup(self.html, self.styles)
 
-        return self.write()
+        return self.write(doc)
 
     def read_page_properties(self, doc):
         current = []
@@ -266,8 +266,8 @@ class Convert(object):
 
         self.styles.resolve_numbering(numbering)
 
-    def write(self):
-        toc = create_toc(self.body)
+    def write(self, doc):
+        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map)
         raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
         with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
             f.write(raw)
@@ -367,11 +367,13 @@ class Convert(object):
         return wrapper
 
     def resolve_links(self, relationships_by_id):
+        self.resolved_link_map = {}
         for hyperlink, spans in self.link_map.iteritems():
             span = spans[0]
             if len(spans) > 1:
                 span = self.wrap_elems(spans, SPAN())
             span.tag = 'a'
+            self.resolved_link_map[hyperlink] = span
             tgt = get(hyperlink, 'w:tgtFrame')
             if tgt:
                 span.set('target', tgt)
diff --git a/src/calibre/ebooks/docx/toc.py b/src/calibre/ebooks/docx/toc.py
index 8036808701..5936d34355 100644
--- a/src/calibre/ebooks/docx/toc.py
+++ b/src/calibre/ebooks/docx/toc.py
@@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
-from calibre.ebooks.docx.names import XPath, descendants
+from collections import namedtuple
+
+from lxml.etree import tostring
+
+from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
 
@@ -17,7 +21,7 @@ class Count(object):
     def __init__(self):
         self.val = 0
 
-def create_toc(body):
+def from_headings(body):
     ' Create a TOC from headings in the document '
     headings = ('h1', 'h2', 'h3')
     tocroot = TOC()
@@ -56,5 +60,81 @@ def create_toc(body):
     if len(tuple(tocroot.flat())) > 1:
         return tocroot
 
+def structure_toc(entries):
+    indent_vals = sorted({x.indent for x in entries})
+    last_found = [None for i in indent_vals]
+    newtoc = TOC()
+
+    if len(indent_vals) > 6:
+        for x in entries:
+            newtoc.add_item('index.html', x.anchor, x.text)
+        return newtoc
+
+    def find_parent(level):
+        candidates = last_found[:level]
+        for x in reversed(candidates):
+            if x is not None:
+                return x
+        return newtoc
+
+    for item in entries:
+        level = indent_vals.index(item.indent)
+        parent = find_parent(level)
+        last_found[level] = parent.add_item('index.html', item.anchor,
+                    item.text)
+        for i in xrange(level+1, len(last_found)):
+            last_found[i] = None
+
+    return newtoc
+
+def link_to_txt(a, styles, object_map):
+    if len(a) > 1:
+        for child in a:
+            run = object_map.get(child, None)
+            if run is not None:
+                rs = styles.resolve(run)
+                if rs.css.get('display', None) == 'none':
+                    a.remove(child)
+
+    return tostring(a, method='text', with_tail=False, encoding=unicode).strip()
+
+def from_toc(docx, link_map, styles, object_map):
+    toc_level = None
+    level = 0
+    TI = namedtuple('TI', 'text anchor indent')
+    toc = []
+    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
+        n = tag.tag.rpartition('}')[-1]
+        if n == 'fldChar':
+            t = get(tag, 'w:fldCharType')
+            if t == 'begin':
+                level += 1
+            elif t == 'end':
+                level -= 1
+                if toc_level is not None and level < toc_level:
+                    break
+        elif n == 'instrText':
+            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
+                toc_level = level
+        elif n == 'hyperlink':
+            if toc_level is not None and level >= toc_level and tag in link_map:
+                a = link_map[tag]
+                href = a.get('href', None)
+                txt = link_to_txt(a, styles, object_map)
+                p = ancestor(tag, 'w:p')
+                if txt and href and p is not None:
+                    ps = styles.resolve_paragraph(p)
+                    try:
+                        ml = int(ps.margin_left[:-2])
+                    except (TypeError, ValueError, AttributeError):
+                        ml = 0
+                    if ps.text_align in {'center', 'right'}:
+                        ml = 0
+                    toc.append(TI(txt, href[1:], ml))
+    if toc:
+        return structure_toc(toc)
+
+def create_toc(docx, body, link_map, styles, object_map):
+    return from_toc(docx, link_map, styles, object_map) or from_headings(body)