DOCX: Support for footnotes and endnotes

This commit is contained in:
Kovid Goyal 2013-05-23 14:41:11 +05:30
parent 997dcae358
commit 3566c2e5cb
4 changed files with 116 additions and 6 deletions

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.names import get, XPath, descendants
class Note(object):
    """Wrapper around a single note element from a DOCX notes part.

    Attributes:
        parent: the element this note was created from.
        type: value returned by ``get(parent, 'w:type', 'normal')``;
            presumably the note's ``w:type`` attribute with ``'normal'``
            as the fallback when it is absent (confirm against
            ``calibre.ebooks.docx.names.get``).

    Iterating over a Note yields, one at a time, whatever
    ``descendants(parent, 'w:p')`` produces.
    """

    def __init__(self, parent):
        self.parent = parent
        self.type = get(parent, 'w:type', 'normal')

    def __iter__(self):
        for paragraph in descendants(self.parent, 'w:p'):
            yield paragraph
class Footnotes(object):
    """Registry of the footnotes and endnotes of a document.

    Usage, as far as this class itself shows:

    1. Call the instance with the parsed footnotes/endnotes roots to
       index every note by its ``w:id``.
    2. Call :meth:`get_ref` for each note reference found in the body;
       every successful call allocates the next running number and an
       anchor name, and records the note as referenced.
    3. Iterate over the instance to get the referenced notes back in the
       order in which they were referenced.

    Attributes:
        footnotes: dict mapping ``w:id`` -> Note for footnotes.
        endnotes: dict mapping ``w:id`` -> Note for endnotes.
        counter: number of references resolved so far (footnotes and
            endnotes share this single counter).
        notes: OrderedDict mapping anchor -> (label, Note) for every
            resolved reference.
    """

    def __init__(self):
        self.footnotes = {}
        self.endnotes = {}
        self.counter = 0
        self.notes = OrderedDict()

    def __call__(self, footnotes, endnotes):
        """Index the notes found under the given root elements.

        :param footnotes: root element of the footnotes part, or None.
        :param endnotes: root element of the endnotes part, or None.

        Elements whose ``w:id`` evaluates as false are ignored. A later
        note with the same id replaces an earlier one.
        """
        sources = (
            (footnotes, './w:footnote[@w:id]', self.footnotes),
            (endnotes, './w:endnote[@w:id]', self.endnotes),
        )
        for root, expression, store in sources:
            if root is None:
                continue
            for element in XPath(expression)(root):
                note_id = get(element, 'w:id')
                if note_id:
                    store[note_id] = Note(element)

    def get_ref(self, ref):
        """Resolve a note reference element to an anchor and a label.

        :param ref: a reference element; a tag ending in
            ``}footnoteReference`` is looked up among the footnotes,
            anything else among the endnotes.
        :return: ``(anchor, label)`` where ``anchor`` is ``'note_<n>'``
            and ``label`` is ``<n>`` as text, or ``(None, None)`` when
            the referenced note is unknown or its type is not
            ``'normal'``.
        """
        note_id = get(ref, 'w:id')
        is_footnote = ref.tag.endswith('}footnoteReference')
        known = self.footnotes if is_footnote else self.endnotes
        note = known.get(note_id, None)
        if note is None or note.type != 'normal':
            return None, None

        self.counter += 1
        anchor = 'note_%d' % self.counter
        # type('') is the text type of a plain string literal in this
        # module, so the label has the same string type as the anchor.
        label = type('')(self.counter)
        self.notes[anchor] = (label, note)
        return anchor, label

    def __iter__(self):
        """Yield ``(anchor, label, note)`` in the order of reference."""
        # items() instead of iteritems(): the latter only exists on
        # Python 2 dicts, while items() yields the same pairs, in the
        # same order, on every Python version.
        for anchor, (label, note) in self.notes.items():
            yield anchor, label, note

    @property
    def has_notes(self):
        """True if at least one note reference has been resolved."""
        return bool(self.notes)

View File

@ -21,6 +21,8 @@ NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable' FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image' IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink' LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
namespaces = { namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -384,6 +384,16 @@ class Styles(object):
p { text-indent: 1.5em } p { text-indent: 1.5em }
ul, ol, p { margin: 0; padding: 0 } ul, ol, p { margin: 0; padding: 0 }
sup.noteref a { text-decoration: none }
h1.notes-header { page-break-before: always }
dl.notes dt { font-size: large }
dl.notes dt a { text-decoration: none }
dl.notes dd { page-break-after: always }
''') % (self.body_font_family, self.body_font_size) ''') % (self.body_font_family, self.body_font_size)
if ef: if ef:
prefix = ef + '\n' + prefix prefix = ef + '\n' + prefix

View File

@ -11,16 +11,17 @@ from collections import OrderedDict, defaultdict
from lxml import html from lxml import html
from lxml.html.builder import ( from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, HR) HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import ( from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor, XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants, ancestor) descendants, ancestor, FOOTNOTES, ENDNOTES)
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text: class Text:
@ -34,9 +35,10 @@ class Text:
class Convert(object): class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None): def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
self.docx = DOCX(path_or_stream, log=log) self.docx = DOCX(path_or_stream, log=log)
self.log = self.docx.log self.log = self.docx.log
self.notes_text = notes_text or _('Notes')
self.dest_dir = dest_dir or os.getcwdu() self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata self.mi = self.docx.metadata
self.body = BODY() self.body = BODY()
@ -81,6 +83,20 @@ class Convert(object):
p = self.convert_p(wp) p = self.convert_p(wp)
self.body.append(p) self.body.append(p)
if self.footnotes.has_notes:
dl = DL()
dl.set('class', 'notes')
self.body.append(H1(self.notes_text))
self.body[-1].set('class', 'notes-header')
self.body.append(dl)
for anchor, text, note in self.footnotes:
dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text), id=anchor))
dl[-1][0].tail = ']'
dl.append(DD())
for wp in note:
p = self.convert_p(wp)
dl[-1].append(p)
self.resolve_links(relationships_by_id) self.resolve_links(relationships_by_id)
# TODO: tables <w:tbl> child of <w:body> (nested tables?) # TODO: tables <w:tbl> child of <w:body> (nested tables?)
@ -154,9 +170,25 @@ class Convert(object):
nname = get_name(NUMBERING, 'numbering.xml') nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml') sname = get_name(STYLES, 'styles.xml')
fname = get_name(FONTS, 'fontTable.xml') fname = get_name(FONTS, 'fontTable.xml')
foname = get_name(FOOTNOTES, 'footnotes.xml')
enname = get_name(ENDNOTES, 'endnotes.xml')
numbering = self.numbering = Numbering() numbering = self.numbering = Numbering()
footnotes = self.footnotes = Footnotes()
fonts = self.fonts = Fonts() fonts = self.fonts = Fonts()
foraw = enraw = None
if foname is not None:
try:
foraw = self.docx.read(foname)
except KeyError:
self.log.warn('Footnotes %s do not exist' % foname)
if enname is not None:
try:
enraw = self.docx.read(enname)
except KeyError:
self.log.warn('Endnotes %s do not exist' % enname)
footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)
if fname is not None: if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0] embed_relationships = self.docx.get_relationships(fname)[0]
try: try:
@ -327,9 +359,13 @@ class Convert(object):
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir): for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
text.add_elem(img) text.add_elem(img)
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:continuationSeparator'): elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
text.add_elem(HR()) anchor, name = self.footnotes.get_ref(child)
ans.append(text.elem) if anchor and name:
l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
l.set('class', 'noteref')
text.add_elem(l)
ans.append(text.elem)
if text.buf: if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf)) setattr(text.elem, text.attr, ''.join(text.buf))