DOCX Input: Support for Word-created ToC

DOCX Input: Support for Table of Contents created using the Word Table
of Contents tool. calibre now first looks for such a Table of Contents
and only if one is not found does it generate a ToC from headings.
This commit is contained in:
Kovid Goyal 2013-06-15 18:44:58 +05:30
parent e444f27de8
commit 8c261063b4
2 changed files with 87 additions and 5 deletions

View File

@ -171,7 +171,7 @@ class Convert(object):
self.log.debug('Cleaning up redundant markup generated by Word') self.log.debug('Cleaning up redundant markup generated by Word')
cleanup_markup(self.html, self.styles) cleanup_markup(self.html, self.styles)
return self.write() return self.write(doc)
def read_page_properties(self, doc): def read_page_properties(self, doc):
current = [] current = []
@ -266,8 +266,8 @@ class Convert(object):
self.styles.resolve_numbering(numbering) self.styles.resolve_numbering(numbering)
def write(self): def write(self, doc):
toc = create_toc(self.body) toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map)
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>') raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw) f.write(raw)
@ -367,11 +367,13 @@ class Convert(object):
return wrapper return wrapper
def resolve_links(self, relationships_by_id): def resolve_links(self, relationships_by_id):
self.resolved_link_map = {}
for hyperlink, spans in self.link_map.iteritems(): for hyperlink, spans in self.link_map.iteritems():
span = spans[0] span = spans[0]
if len(spans) > 1: if len(spans) > 1:
span = self.wrap_elems(spans, SPAN()) span = self.wrap_elems(spans, SPAN())
span.tag = 'a' span.tag = 'a'
self.resolved_link_map[hyperlink] = span
tgt = get(hyperlink, 'w:tgtFrame') tgt = get(hyperlink, 'w:tgtFrame')
if tgt: if tgt:
span.set('target', tgt) span.set('target', tgt)

View File

@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath, descendants from collections import namedtuple
from lxml.etree import tostring
from calibre.ebooks.docx.names import XPath, descendants, get, ancestor
from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
@ -17,7 +21,7 @@ class Count(object):
def __init__(self): def __init__(self):
self.val = 0 self.val = 0
def create_toc(body): def from_headings(body):
' Create a TOC from headings in the document ' ' Create a TOC from headings in the document '
headings = ('h1', 'h2', 'h3') headings = ('h1', 'h2', 'h3')
tocroot = TOC() tocroot = TOC()
@ -56,5 +60,81 @@ def create_toc(body):
if len(tuple(tocroot.flat())) > 1: if len(tuple(tocroot.flat())) > 1:
return tocroot return tocroot
def structure_toc(entries):
    '''
    Arrange a flat sequence of ToC entries into a tree, using indentation.

    Every entry must provide ``text``, ``anchor`` and ``indent`` attributes.
    The distinct indent values, sorted in ascending order, define the nesting
    levels. An entry is attached to the most recently added entry of any
    shallower level, or directly to the root if there is no such entry.

    If there are more than six distinct indent values, no nesting is
    attempted and all entries are added directly to the root.

    Returns the root TOC object; every item added points at 'index.html'.
    '''
    root = TOC()
    levels = sorted({entry.indent for entry in entries})

    if len(levels) > 6:
        for entry in entries:
            root.add_item('index.html', entry.anchor, entry.text)
        return root

    depth_of = {indent: depth for depth, indent in enumerate(levels)}
    # latest[d] is the last item added at depth d that is still a valid
    # parent, or None if there is none.
    latest = [None] * len(levels)

    def find_parent(depth):
        # Nearest shallower level that currently holds an item. The explicit
        # None test is deliberate: plain truthiness of an item is not used.
        shallower = reversed(latest[:depth])
        return next((node for node in shallower if node is not None), root)

    for entry in entries:
        depth = depth_of[entry.indent]
        owner = find_parent(depth)
        latest[depth] = owner.add_item('index.html', entry.anchor, entry.text)
        # Items at deeper levels can no longer act as parents
        latest[depth + 1:] = [None] * (len(latest) - depth - 1)

    return root
def link_to_txt(a, styles, object_map):
    '''
    Return the stripped text content of the link element ``a``.

    When ``a`` has more than one child, each child is looked up in
    ``object_map``; if an object is found for it and the style that
    ``styles.resolve()`` returns for that object has a CSS ``display`` of
    ``none``, the child is removed from ``a`` before the text is extracted.
    Note that this modifies ``a`` in place.
    '''
    if len(a) > 1:
        hidden = []
        for node in a:
            run = object_map.get(node, None)
            if run is None:
                continue
            if styles.resolve(run).css.get('display', None) == 'none':
                hidden.append(node)
        # Remove only after the scan, so the element is never modified while
        # it is being iterated over.
        for node in hidden:
            a.remove(node)

    text = tostring(a, method='text', with_tail=False, encoding=unicode)
    return text.strip()
def from_toc(docx, link_map, styles, object_map):
    '''
    Create a TOC from a Table of Contents field present in the document.

    The document is scanned, in document order, for ``w:fldChar`` elements
    (that have a ``w:fldCharType``), ``w:instrText`` elements and
    ``w:hyperlink`` elements. ``begin``/``end`` field characters are used to
    track field nesting. An instruction text that starts with ``TOC `` inside
    a field marks that field as the Table of Contents; hyperlinks found inside
    it (and present in ``link_map``) become ToC entries. Scanning stops as
    soon as the Table of Contents field is closed.

    :param docx: The parsed document that the XPath expression is run against
    :param link_map: Maps ``w:hyperlink`` elements to the link elements
        generated for them, which are read for their ``href`` and text
    :param styles: Object used to resolve run and paragraph styles
    :param object_map: Passed through to :func:`link_to_txt`
    :return: The result of :func:`structure_toc` for the entries found, or
        ``None`` if no entries were found
    '''
    TI = namedtuple('TI', 'text anchor indent')
    toc_level = None  # Field nesting depth of the TOC field, once found
    level = 0  # Current field nesting depth
    toc = []
    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
        n = tag.tag.rpartition('}')[-1]
        if n == 'fldChar':
            t = get(tag, 'w:fldCharType')
            if t == 'begin':
                level += 1
            elif t == 'end':
                level -= 1
                if toc_level is not None and level < toc_level:
                    # The TOC field itself has ended, nothing more to collect
                    break
        elif n == 'instrText':
            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
                toc_level = level
        elif n == 'hyperlink':
            if toc_level is None or level < toc_level or tag not in link_map:
                continue
            a = link_map[tag]
            href = a.get('href', None)
            txt = link_to_txt(a, styles, object_map)
            p = ancestor(tag, 'w:p')
            # The anchor is obtained by dropping the leading '#', so only
            # fragment links can be used. Any other href (for example an
            # external URL) would otherwise lose its first character and
            # yield a bogus anchor.
            if not (txt and href and href.startswith('#')) or p is None:
                continue
            ps = styles.resolve_paragraph(p)
            try:
                # Drop the two character unit suffix. Parse via float() so
                # that a fractional value such as 35.5 is not rejected (and
                # thereby silently turned into an indent of zero).
                ml = int(float(ps.margin_left[:-2]))
            except (TypeError, ValueError, AttributeError, OverflowError):
                ml = 0
            if ps.text_align in {'center', 'right'}:
                # Indentation carries no structural meaning for centered or
                # right aligned paragraphs
                ml = 0
            toc.append(TI(txt, href[1:], ml))
    if toc:
        return structure_toc(toc)
def create_toc(docx, body, link_map, styles, object_map):
    '''
    Create the Table of Contents for the converted document.

    The result of from_toc() is used if it is truthy, otherwise the result
    of from_headings() for ``body`` is returned.
    '''
    toc = from_toc(docx, link_map, styles, object_map)
    if toc:
        return toc
    return from_headings(body)