DOCX: Support for footnotes and endnotes

2025-07-09 03:04:10 -04:00 · 2013-05-23 14:41:11 +05:30 · 2013-05-23 14:41:11 +05:30 · 3566c2e5cb
commit 3566c2e5cb
parent 997dcae358
4 changed files with 116 additions and 6 deletions
--- a/src/calibre/ebooks/docx/footnotes.py
+++ b/src/calibre/ebooks/docx/footnotes.py
@ -0,0 +1,62 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 from collections import OrderedDict
 from calibre.ebooks.docx.names import get, XPath, descendants
 class Note(object):
    """A single footnote or endnote element taken from the DOCX markup.
    Iterating over a Note yields the w:p elements found below the wrapped
    element, as returned by descendants().
    """
    def __init__(self, parent):
        """Wrap *parent*, the note element, and record its w:type value."""
        self.parent = parent
        # 'normal' is passed as the third argument to get(); presumably it is
        # the fallback used when the element has no w:type attribute.
        self.type = get(parent, 'w:type', 'normal')
    def __iter__(self):
        """Yield, one at a time, each w:p descendant of the wrapped element."""
        paragraphs = descendants(self.parent, 'w:p')
        for paragraph in paragraphs:
            yield paragraph
 class Footnotes(object):
    """Registry of the notes of a document and of the references made to them.
    Calling the instance loads the note definitions; get_ref() is then called
    once per reference element and hands out a running number plus an anchor
    name. Iterating the instance yields every referenced note in the order
    the references were resolved.
    """
    def __init__(self):
        # Parsed notes, keyed by the value of their w:id attribute
        self.footnotes = {}
        self.endnotes = {}
        # Number of references resolved so far (source of labels and anchors)
        self.counter = 0
        # anchor name -> (label, Note), kept in resolution order
        self.notes = OrderedDict()
    def __call__(self, footnotes, endnotes):
        """Load note definitions from the parsed footnotes/endnotes roots.
        Either argument may be None, in which case that kind of note is
        skipped. Elements whose w:id value is empty are ignored.
        """
        sources = (
            (footnotes, './w:footnote[@w:id]', self.footnotes),
            (endnotes, './w:endnote[@w:id]', self.endnotes),
        )
        for root, expr, registry in sources:
            if root is None:
                continue
            for elem in XPath(expr)(root):
                fid = get(elem, 'w:id')
                if fid:
                    registry[fid] = Note(elem)
    def get_ref(self, ref):
        """Resolve a reference element to ``(anchor, label)``.
        The note is looked up among the footnotes when the tag of *ref* ends
        with '}footnoteReference', otherwise among the endnotes. Returns
        ``(None, None)`` when no such note is known or when its type is not
        'normal'. Every successful call consumes a new number, so a note that
        is referenced twice is registered under two anchors.
        """
        fid = get(ref, 'w:id')
        notes = self.footnotes if ref.tag.endswith('}footnoteReference') else self.endnotes
        note = notes.get(fid, None)
        if note is None or note.type != 'normal':
            return None, None
        self.counter += 1
        # type('') is the text type of this module's string literals
        label = type('')(self.counter)
        anchor = 'note_%d' % self.counter
        self.notes[anchor] = (label, note)
        return anchor, label
    def __iter__(self):
        """Yield ``(anchor, label, note)`` for every registered reference.
        Uses items() on a snapshot instead of the Python 2 only iteritems(),
        so it runs on Python 2 and 3 alike. The mapping is re-checked after
        each snapshot so that notes registered while the caller is consuming
        this iterator (get_ref() being called in between) are yielded too,
        rather than being dropped or raising a mutation error.
        """
        done = 0
        while done < len(self.notes):
            batch = list(self.notes.items())[done:]
            for anchor, (counter, note) in batch:
                yield anchor, counter, note
            done += len(batch)
    @property
    def has_notes(self):
        """True when at least one reference has been registered."""
        return bool(self.notes)
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@ -21,6 +21,8 @@ NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
 FONTS     = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
 IMAGES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
 LINKS     = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
 FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
 ENDNOTES  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
 namespaces = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@ -384,6 +384,16 @@ class Styles(object):
            p { text-indent: 1.5em }
            ul, ol, p { margin: 0; padding: 0 }
            sup.noteref a { text-decoration: none }
            h1.notes-header { page-break-before: always }
            dl.notes dt { font-size: large }
            dl.notes dt a { text-decoration: none }
            dl.notes dd { page-break-after: always }
            ''') % (self.body_font_family, self.body_font_size)
        if ef:
            prefix = ef + '\n' + prefix
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -11,16 +11,17 @@ from collections import OrderedDict, defaultdict
 from lxml import html
 from lxml.html.builder import (
-    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, HR)
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
 from calibre.ebooks.docx.container import DOCX, fromstring
 from calibre.ebooks.docx.names import (
    XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
-    descendants, ancestor)
+    descendants, ancestor, FOOTNOTES, ENDNOTES)
 from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
 from calibre.ebooks.docx.numbering import Numbering
 from calibre.ebooks.docx.fonts import Fonts
 from calibre.ebooks.docx.images import Images
 from calibre.ebooks.docx.footnotes import Footnotes
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
 class Text:
@ -34,9 +35,10 @@ class Text:
 class Convert(object):
-    def __init__(self, path_or_stream, dest_dir=None, log=None):
+    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
@ -81,6 +83,20 @@ class Convert(object):
            p = self.convert_p(wp)
            self.body.append(p)
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            self.body[-1].set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                for wp in note:
                    p = self.convert_p(wp)
                    dl[-1].append(p)
        self.resolve_links(relationships_by_id)
        # TODO: tables <w:tbl> child of <w:body> (nested tables?)
@ -154,9 +170,25 @@ class Convert(object):
        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()
        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
        footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)
        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
@ -327,9 +359,13 @@ class Convert(object):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
-            elif is_tag(child, 'w:continuationSeparator'):
+            elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
-                text.add_elem(HR())
+                anchor, name = self.footnotes.get_ref(child)
-                ans.append(text.elem)
+                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))