DOCX: Support for footnotes and endnotes

This commit is contained in:
Kovid Goyal 2013-05-23 14:41:11 +05:30
parent 997dcae358
commit 3566c2e5cb
4 changed files with 116 additions and 6 deletions

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.names import get, XPath, descendants
class Note(object):
def __init__(self, parent):
self.type = get(parent, 'w:type', 'normal')
self.parent = parent
def __iter__(self):
for p in descendants(self.parent, 'w:p'):
yield p
class Footnotes(object):
def __init__(self):
self.footnotes = {}
self.endnotes = {}
self.counter = 0
self.notes = OrderedDict()
def __call__(self, footnotes, endnotes):
if footnotes is not None:
for footnote in XPath('./w:footnote[@w:id]')(footnotes):
fid = get(footnote, 'w:id')
if fid:
self.footnotes[fid] = Note(footnote)
if endnotes is not None:
for endnote in XPath('./w:endnote[@w:id]')(endnotes):
fid = get(endnote, 'w:id')
if fid:
self.endnotes[fid] = Note(endnote)
def get_ref(self, ref):
fid = get(ref, 'w:id')
notes = self.footnotes if ref.tag.endswith('}footnoteReference') else self.endnotes
note = notes.get(fid, None)
if note is not None and note.type == 'normal':
self.counter += 1
anchor = 'note_%d' % self.counter
self.notes[anchor] = (type('')(self.counter), note)
return anchor, type('')(self.counter)
return None, None
def __iter__(self):
for anchor, (counter, note) in self.notes.iteritems():
yield anchor, counter, note
@property
def has_notes(self):
return bool(self.notes)

View File

@ -21,6 +21,8 @@ NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -384,6 +384,16 @@ class Styles(object):
p { text-indent: 1.5em }
ul, ol, p { margin: 0; padding: 0 }
sup.noteref a { text-decoration: none }
h1.notes-header { page-break-before: always }
dl.notes dt { font-size: large }
dl.notes dt a { text-decoration: none }
dl.notes dd { page-break-after: always }
''') % (self.body_font_family, self.body_font_size)
if ef:
prefix = ef + '\n' + prefix

View File

@ -11,16 +11,17 @@ from collections import OrderedDict, defaultdict
from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, HR)
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants, ancestor)
descendants, ancestor, FOOTNOTES, ENDNOTES)
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text:
@ -34,9 +35,10 @@ class Text:
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None):
def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
self.docx = DOCX(path_or_stream, log=log)
self.log = self.docx.log
self.notes_text = notes_text or _('Notes')
self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata
self.body = BODY()
@ -81,6 +83,20 @@ class Convert(object):
p = self.convert_p(wp)
self.body.append(p)
if self.footnotes.has_notes:
dl = DL()
dl.set('class', 'notes')
self.body.append(H1(self.notes_text))
self.body[-1].set('class', 'notes-header')
self.body.append(dl)
for anchor, text, note in self.footnotes:
dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text), id=anchor))
dl[-1][0].tail = ']'
dl.append(DD())
for wp in note:
p = self.convert_p(wp)
dl[-1].append(p)
self.resolve_links(relationships_by_id)
# TODO: tables <w:tbl> child of <w:body> (nested tables?)
@ -154,9 +170,25 @@ class Convert(object):
nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml')
fname = get_name(FONTS, 'fontTable.xml')
foname = get_name(FOOTNOTES, 'footnotes.xml')
enname = get_name(ENDNOTES, 'endnotes.xml')
numbering = self.numbering = Numbering()
footnotes = self.footnotes = Footnotes()
fonts = self.fonts = Fonts()
foraw = enraw = None
if foname is not None:
try:
foraw = self.docx.read(foname)
except KeyError:
self.log.warn('Footnotes %s do not exist' % foname)
if enname is not None:
try:
enraw = self.docx.read(enname)
except KeyError:
self.log.warn('Endnotes %s do not exist' % enname)
footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)
if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0]
try:
@ -327,9 +359,13 @@ class Convert(object):
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
text.add_elem(img)
ans.append(text.elem)
elif is_tag(child, 'w:continuationSeparator'):
text.add_elem(HR())
ans.append(text.elem)
elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
anchor, name = self.footnotes.get_ref(child)
if anchor and name:
l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
l.set('class', 'noteref')
text.add_elem(l)
ans.append(text.elem)
if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf))