From 8c261063b484128bc2d4333d9fa7eef1a97a84e2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 15 Jun 2013 18:44:58 +0530 Subject: [PATCH] DOCX Input: Support for Word created ToC DOCX Input: Support for Table of Contents created using the Word Table of Contents tool. calibre now first looks for such a Table of Contents and only if one is not found does it generate a ToC from headings. --- src/calibre/ebooks/docx/to_html.py | 8 +-- src/calibre/ebooks/docx/toc.py | 84 +++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 6fd0026874..c3b2391d3f 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -171,7 +171,7 @@ class Convert(object): self.log.debug('Cleaning up redundant markup generated by Word') cleanup_markup(self.html, self.styles) - return self.write() + return self.write(doc) def read_page_properties(self, doc): current = [] @@ -266,8 +266,8 @@ class Convert(object): self.styles.resolve_numbering(numbering) - def write(self): - toc = create_toc(self.body) + def write(self, doc): + toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map) raw = html.tostring(self.html, encoding='utf-8', doctype='') with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: f.write(raw) @@ -367,11 +367,13 @@ class Convert(object): return wrapper def resolve_links(self, relationships_by_id): + self.resolved_link_map = {} for hyperlink, spans in self.link_map.iteritems(): span = spans[0] if len(spans) > 1: span = self.wrap_elems(spans, SPAN()) span.tag = 'a' + self.resolved_link_map[hyperlink] = span tgt = get(hyperlink, 'w:tgtFrame') if tgt: span.set('target', tgt) diff --git a/src/calibre/ebooks/docx/toc.py b/src/calibre/ebooks/docx/toc.py index 8036808701..5936d34355 100644 --- a/src/calibre/ebooks/docx/toc.py +++ b/src/calibre/ebooks/docx/toc.py @@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -from calibre.ebooks.docx.names import XPath, descendants +from collections import namedtuple + +from lxml.etree import tostring + +from calibre.ebooks.docx.names import XPath, descendants, get, ancestor from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.oeb.polish.toc import elem_to_toc_text @@ -17,7 +21,7 @@ class Count(object): def __init__(self): self.val = 0 -def create_toc(body): +def from_headings(body): ' Create a TOC from headings in the document ' headings = ('h1', 'h2', 'h3') tocroot = TOC() @@ -56,5 +60,81 @@ def create_toc(body): if len(tuple(tocroot.flat())) > 1: return tocroot +def structure_toc(entries): + indent_vals = sorted({x.indent for x in entries}) + last_found = [None for i in indent_vals] + newtoc = TOC() + + if len(indent_vals) > 6: + for x in entries: + newtoc.add_item('index.html', x.anchor, x.text) + return newtoc + + def find_parent(level): + candidates = last_found[:level] + for x in reversed(candidates): + if x is not None: + return x + return newtoc + + for item in entries: + level = indent_vals.index(item.indent) + parent = find_parent(level) + last_found[level] = parent.add_item('index.html', item.anchor, + item.text) + for i in xrange(level+1, len(last_found)): + last_found[i] = None + + return newtoc + +def link_to_txt(a, styles, object_map): + if len(a) > 1: + for child in a: + run = object_map.get(child, None) + if run is not None: + rs = styles.resolve(run) + if rs.css.get('display', None) == 'none': + a.remove(child) + + return tostring(a, method='text', with_tail=False, encoding=unicode).strip() + +def from_toc(docx, link_map, styles, object_map): + toc_level = None + level = 0 + TI = namedtuple('TI', 'text anchor indent') + toc = [] + for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx): + n = tag.tag.rpartition('}')[-1] + if n == 'fldChar': + t = get(tag, 'w:fldCharType') + if t == 'begin': + level += 1 + elif t == 'end': + level -= 1 + if toc_level is not None and level < toc_level: + break + elif n == 'instrText': + if level > 0 and tag.text and tag.text.strip().startswith('TOC '): + toc_level = level + elif n == 'hyperlink': + if toc_level is not None and level >= toc_level and tag in link_map: + a = link_map[tag] + href = a.get('href', None) + txt = link_to_txt(a, styles, object_map) + p = ancestor(tag, 'w:p') + if txt and href and p is not None: + ps = styles.resolve_paragraph(p) + try: + ml = int(ps.margin_left[:-2]) + except (TypeError, ValueError, AttributeError): + ml = 0 + if ps.text_align in {'center', 'right'}: + ml = 0 + toc.append(TI(txt, href[1:], ml)) + if toc: + return structure_toc(toc) + +def create_toc(docx, body, link_map, styles, object_map): + return from_toc(docx, link_map, styles, object_map) or from_headings(body)