DOCX Input: Support for Word created ToC

DOCX Input: Support for Table of Contents created using the Word Table
of Contents tool. calibre now first looks for such a Table of Contents
and only if one is not found does it generate a ToC from headings.
This commit is contained in:
Kovid Goyal 2013-06-15 18:44:58 +05:30
parent e444f27de8
commit 8c261063b4
2 changed files with 87 additions and 5 deletions

View File

@ -171,7 +171,7 @@ class Convert(object):
self.log.debug('Cleaning up redundant markup generated by Word')
cleanup_markup(self.html, self.styles)
return self.write()
return self.write(doc)
def read_page_properties(self, doc):
current = []
@ -266,8 +266,8 @@ class Convert(object):
self.styles.resolve_numbering(numbering)
def write(self):
toc = create_toc(self.body)
def write(self, doc):
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map)
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw)
@ -367,11 +367,13 @@ class Convert(object):
return wrapper
def resolve_links(self, relationships_by_id):
self.resolved_link_map = {}
for hyperlink, spans in self.link_map.iteritems():
span = spans[0]
if len(spans) > 1:
span = self.wrap_elems(spans, SPAN())
span.tag = 'a'
self.resolved_link_map[hyperlink] = span
tgt = get(hyperlink, 'w:tgtFrame')
if tgt:
span.set('target', tgt)

View File

@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath, descendants
from collections import namedtuple
from lxml.etree import tostring
from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -17,7 +21,7 @@ class Count(object):
def __init__(self):
    # Every new instance starts with its ``val`` attribute at zero.
    self.val = 0
def create_toc(body):
def from_headings(body):
' Create a TOC from headings in the document '
headings = ('h1', 'h2', 'h3')
tocroot = TOC()
@ -56,5 +60,81 @@ def create_toc(body):
if len(tuple(tocroot.flat())) > 1:
return tocroot
def structure_toc(entries):
    '''
    Build a TOC tree from a flat sequence of entries, nesting by indent.

    Each entry must expose ``text``, ``anchor`` and ``indent`` attributes.
    The distinct indent values, sorted ascending, define the nesting levels:
    an entry becomes a child of the most recently added entry at the nearest
    shallower level, or of the root when there is none. If more than six
    distinct indent values are present, no nesting is attempted and every
    entry is added directly under the root. All items point at 'index.html'.

    Returns the root TOC object.
    '''
    levels = sorted({entry.indent for entry in entries})
    root = TOC()

    if len(levels) > 6:
        # Flat fallback: add everything directly under the root.
        for entry in entries:
            root.add_item('index.html', entry.anchor, entry.text)
        return root

    # most_recent[i] holds the latest node added at level i that may still
    # act as a parent, or None when there is no such node.
    most_recent = [None] * len(levels)

    for entry in entries:
        depth = levels.index(entry.indent)

        # Nearest shallower level that has a live node, falling back to root.
        parent = root
        for node in reversed(most_recent[:depth]):
            if node is not None:
                parent = node
                break

        most_recent[depth] = parent.add_item(
            'index.html', entry.anchor, entry.text)

        # Nodes at deeper levels belonged to the previous branch; forget them.
        most_recent[depth + 1:] = [None] * (len(most_recent) - depth - 1)

    return root
def link_to_txt(a, styles, object_map):
    '''
    Return the text content of the link element ``a``, stripped of
    surrounding whitespace.

    When ``a`` has more than one child, every child that ``object_map`` maps
    to a run whose resolved style has a CSS ``display`` of ``none`` is removed
    from ``a`` before the text is extracted, so its text is not included.
    Note that this modifies ``a`` in place. The tail text of ``a`` itself is
    excluded from the result.
    '''
    if len(a) > 1:
        # Walk a snapshot of the children: the loop body removes elements
        # from ``a``, and mutating a container while iterating over it
        # directly is fragile.
        for child in tuple(a):
            run = object_map.get(child, None)
            if run is None:
                continue
            if styles.resolve(run).css.get('display', None) == 'none':
                a.remove(child)

    return tostring(a, method='text', with_tail=False, encoding=unicode).strip()
def from_toc(docx, link_map, styles, object_map):
    '''
    Build a TOC from the hyperlinks found inside a "TOC" field in ``docx``.

    Walks the w:fldChar, w:hyperlink and w:instrText elements matched in
    ``docx`` while tracking field nesting depth ('begin' increments it, 'end'
    decrements it). Once an instruction text starting with 'TOC ' is seen
    inside a field, hyperlinks at that depth or deeper that are present in
    ``link_map`` are collected, until the depth drops below the depth at
    which the TOC instruction was seen.

    Each collected entry records the link text, the link's href minus its
    first character, and the left margin of the enclosing paragraph as an
    indent. Returns the result of structure_toc() on the collected entries,
    or None when nothing was collected.
    '''
    entry_type = namedtuple('TI', 'text anchor indent')
    find_nodes = XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')

    depth = 0
    toc_depth = None  # field depth at which the TOC instruction was seen
    collected = []

    for node in find_nodes(docx):
        kind = node.tag.rpartition('}')[-1]

        if kind == 'fldChar':
            char_type = get(node, 'w:fldCharType')
            if char_type == 'begin':
                depth += 1
            elif char_type == 'end':
                depth -= 1
                if toc_depth is not None and depth < toc_depth:
                    # Left the field that contained the TOC instruction.
                    break
            continue

        if kind == 'instrText':
            if depth > 0 and node.text and node.text.strip().startswith('TOC '):
                toc_depth = depth
            continue

        if kind != 'hyperlink':
            continue
        if toc_depth is None or depth < toc_depth or node not in link_map:
            continue

        link = link_map[node]
        href = link.get('href', None)
        # Called before the checks below on purpose: it also prunes hidden
        # children from the link element.
        text = link_to_txt(link, styles, object_map)
        para = ancestor(node, 'w:p')
        if not text or not href or para is None:
            continue

        para_style = styles.resolve_paragraph(para)
        try:
            # Drops the last two characters before parsing -- presumably a
            # unit suffix such as 'pt'; TODO confirm against the styles code.
            indent = int(para_style.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            indent = 0
        if para_style.text_align in {'center', 'right'}:
            # The left margin is not used as an indent for these alignments.
            indent = 0

        # href[1:] drops the first character (presumably a leading '#').
        collected.append(entry_type(text, href[1:], indent))

    return structure_toc(collected) if collected else None
def create_toc(docx, body, link_map, styles, object_map):
    '''
    Return a TOC for the document.

    The result of from_toc() is used when it is truthy; otherwise the result
    of from_headings(body) is returned instead.
    '''
    toc = from_toc(docx, link_map, styles, object_map)
    if toc:
        return toc
    return from_headings(body)