This material adds the option to generate an alphabetical index from index markers in a docx file.

2025-07-09 03:04:10 -04:00 · 2014-03-25 15:26:20 -07:00 · 2014-03-25 15:26:20 -07:00 · f790038819
commit f790038819
parent 290462909f
7 changed files with 461 additions and 4 deletions
--- a/src/calibre/ebooks/conversion/plugins/docx_input.py
+++ b/src/calibre/ebooks/conversion/plugins/docx_input.py
@ -19,6 +19,9 @@ class DOCXInput(InputFormatPlugin):
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created ebook. This option '
                   'turns off that behavior.')),
+        OptionRecommendation(name='docx_index', recommended_value=False,
+            help=_('If there are embedded index markers in the document, this option will use them to create '
+                   'an alphabetical index with links to the locations of the markers.')),

    }

@ -26,5 +29,5 @@ class DOCXInput(InputFormatPlugin):

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.docx.to_html import Convert
-        return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()
+        return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)()

--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@ -22,7 +22,8 @@ from calibre.utils.zipfile import ZipFile
 from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER

 def fromstring(raw, parser=RECOVER_PARSER):
-    return etree.fromstring(raw, parser=parser)
+    res = etree.fromstring(raw, parser=parser)
+    return res

 # Read metadata {{{
 def read_doc_props(raw, mi):
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@ -10,12 +10,15 @@ import re

 from calibre.ebooks.docx.names import XPath, get

+import sys
+
 class Field(object):

    def __init__(self, start):
        self.start = start
        self.end = None
        self.contents = []
+        self.elements = []
        self.instructions = []

    def add_instr(self, elem):
@ -24,6 +27,7 @@ class Field(object):
            return
        name, rest = raw.strip().partition(' ')[0::2]
        self.instructions.append((name, rest.strip()))
+        self.elements.append(elem)

 WORD, FLAG = 0, 1
 scanner = re.Scanner([
--- a/src/calibre/ebooks/docx/index.py
+++ b/src/calibre/ebooks/docx/index.py
@ -0,0 +1,427 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import itertools
+from collections import OrderedDict
+from lxml import html
+from lxml.html.builder import (
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
+from calibre.ebooks.docx.names import (
+    XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
+    ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
+
+import lxml.etree
+
+NBSP = '\xa0'
+
+class Location:
+    """
+    This class represents one location in the index.
+    We should provide a way to mark the main entries. Libre office
+    has a main attribute, which doesn't seem to map to docx, and at least
+    some versions of word can mark entries bold or italic with \b and \i.
+    One index entry corresponds to a list of locations where the entry
+    is referenced in the text.
+    """
+
+    def __init__(self, bookmark, target):
+        self.bookmark = bookmark
+        self.target = target
+
+class Entry:
+    """
+    This class represents one index entry.
+    We can also have a list of subentries for the primary/secondary
+    topic situation.
+    Each entry has a list of locations we want to point to, but
+    it could be empty if this is only here to organize subentries.
+    """
+
+    def __init__(self, name, index):
+        self.subentries = {}
+        self.locations = []
+        self.name = name
+        self.index = index
+    
+    def addEntry(self, entry, sub):
+        """
+        The entry has the form [xxx, field, bookmark, target]
+        """
+        if len(sub) == 0:
+            self.locations.append(Location(entry[2], entry[3]))
+        else:
+            sube = Index.findEntry(sub[0], self.subentries, self.index)
+            sube.addEntry(entry, sub[1:])
+
+    def makeLink(self, loc, amap):
+        # As a first pass, we just put a placeholder in the target location
+        # We want it to float right
+        markid = amap[loc.bookmark]
+        if markid == None:
+            return
+
+        span = A()
+        span.set('style', 'float:right')
+        span.set('href', '#' + markid)
+        from calibre.ebooks.docx.to_html import Text
+        text = Text(span, 'text', [])
+        text.buf.append(loc.target)
+        setattr(text.elem, text.attr, ''.join(text.buf))
+        return span
+
+    def toHtmlUnit(self, body, level, amap):
+        """
+        Append the material for one index entry to the document.
+        There is a name, and 0 or more locations.
+        Put the first location, if any, on the same line as the
+        name, and others on following lines.
+        """
+        style = self.index.entryStyles[level]
+        main = Index.addName(self.name, style)
+        if len(self.locations) == 0:
+            body.append(main)
+            return
+
+        # First link on same line as name
+        link = self.makeLink(self.locations[0], amap)
+        main.append(link)
+        body.append(main)
+
+        # Put other links for same entry on their own lines
+        # To keep the link span separate need to put a space as the name
+        for l in self.locations[1:]:
+            link = self.makeLink(l, amap)
+            dest = P()
+            dest.set('class', style)
+            dest.text = NBSP
+            dest.append(link)
+            body.append(dest)
+
+    def toHtml(self, body, level, amap):
+        level = min(level, 2)
+        self.toHtmlUnit(body, level, amap)
+        for key in sorted(self.subentries.keys()):
+            self.subentries[key].toHtml(body, level + 1, amap)
+
+class Section:
+    """
+    This class represents one section of the index - usually,
+    for example, the A's or the B's.
+    It is primarily a dictionary of entries.
+    """
+
+    def __init__(self, index):
+        self.index = index
+        self.entries = {}
+
+    def addEntry(self, entry):
+        """
+        We have information from one index marker.
+        The entry has form [name, field, bookmark, target].
+        The name is something like A or A:B and so on.
+        If we already have an entry for that name, just add the new
+        location to it; otherwise create a new entry.
+        """
+        topics = entry[0].strip('"').split(':')
+        targ = Index.findEntry(topics[0], self.entries, self.index)
+        targ.addEntry(entry, topics[1:])
+    
+    def toHtml(self, key, body, amap):
+        """
+        Add one section of the index to the html
+        """
+        if len(key) > 0:
+            body.append(Index.addName(key, self.index.sectionStyle))
+        for ekey in sorted(self.entries.keys()):
+            self.entries[ekey].toHtml(body, 0, amap)
+
+class Index:
+    """
+    This class generates an alphabetical index from the index markers in a docx file.
+
+    Each field in the parse of the docx file contains an instructions list.
+    Instructions with name XE are index instructions. 
+    The instruction also contains the entry specifier, of the form A[:B[:C]] for
+    main entry, A, subentry B, and so on.
+
+    The index object is a dictionary of sections, 'A' mapping to a section
+    object with all the A entries, and so on. Each section in turn is a dictionary
+    mapping an index specifier, like A:B, to a list of locations where that
+    entry is referenced.
+
+    We could make the formatting more configurable.
+    Currently it uses fixed styles for the various elements, and a section
+    heading for each letter.
+    """
+
+    def __init__(self, convert):
+        """
+        Convert the index markers in the document into an index object.
+        """
+        self.convert = convert
+        self.sections = {}
+
+        self.genStyles()
+
+        # Get a list of [name, field] entries, where name is the index
+        # entry and field is the indexed location
+        self.entries = self.getEntries()
+
+        # Find styles which are provide the text for links.
+        self.targetStyles()
+
+        # Generate bookmarks in the document at the indexed locations
+        self.bookmarks()
+
+        # Set up the entries in index sections
+        for unit in self.entries:
+            sec = self.findSection(unit[0])
+            sec.addEntry(unit)
+
+    def getEntries(self):
+        """
+        We already have a list of fields which includes the index marks,
+        identified by an XE tag.
+        In the base case, the field object includes an instruction list 
+        with one tuple like ('XE', '"entry"'), where entry is the text we
+        want to put in the index. Note the double quotes around the entry.
+        Sometimes the entry is broken up in the document, for example if
+        there are spelling issues in the entry text.
+        In this case, for reasons I don't understand, the instruction
+        list includes a number of tuples, and we get the actual entry
+        text by concatenating all of them after the initial tag.
+        There can be formatting information in the instructions also, after
+        the double quoted part, like '"entry" \b'.
+        So, we want to concatenate all parts after the initial tag, and
+        then get the part in double quotes.
+        """
+        fields = self.convert.fields.fields
+
+        # Only want the index entries
+        fields = filter(lambda f: len(f.instructions) > 0 and f.instructions[0][0] == 'XE', fields)
+        return map(lambda f: [self.getEntry(f), f], fields)
+    
+    def getEntry(self, field):
+
+        elist = [field.instructions[0][1]]
+        for inst in field.instructions[1:]:
+            elist.append(inst[0])
+            elist.append(inst[1])
+
+        entry = ''.join(elist)
+        sep1 = entry.partition('"')
+        if sep1[2] == '':
+            return entry
+        sep2 = sep1[2].partition('"')
+        return sep2[0]
+
+    def targetStyles(self):
+        """
+        We want to get a list of styles which represent valid index targets.
+        That is, the text of a link in the index will be the title of the 
+        section of the document containing the indexed location.
+        We want the list of styles which can provide a valid title.
+        In practice, this maps to Heading1 through Heading3 in the original document.
+        Calibre apparently preprocesses docx files, so that a paragraph in
+        the original with style Heading1 will now have a different, internal style.
+        In this version we use convert.styles.id_map to find style ids
+        with internal names beginning Heading; but I'd feel better if we
+        jumped in earlier and could map it to the original docx styles.
+        """
+        smap = self.convert.styles.id_map
+        self.targstyles = [name for name, style in smap.iteritems() if style.name.startswith('Heading')]
+
+    def isHeading(self, node):
+        """
+        Return true if the input node is a valid index link target.
+        """
+        snodes = XPath("./w:pPr/w:pStyle")(node)
+        if len(snodes) == 0:
+            return False;
+
+        sn = snodes[0]
+
+        # The key includes the long namespace information
+        k = [key for key in sn.keys() if key.endswith('}val')]
+        if len(k) == 0:
+            return False
+        style = sn.get(k[0])
+        return style in self.targstyles
+
+    def getHeadings(self, node):
+        """
+        Get a list of all children of the input node which are headings -
+        that is, valid targets for an index link
+        """
+        answer = []
+        for c in node.getchildren():
+            if self.isHeading(c):
+                answer.append(c)
+        return answer
+
+    def textValue(self, node):
+        tnodes = XPath("./w:r/w:t")(node)
+        if len(tnodes) == 0:
+            return 'Link'
+        textl = map(lambda x: x.text, tnodes)
+        return ''.join(textl)
+
+    def findTarget(self, node):
+        """
+        Given an index entry, find the text of the last heading section
+        preceding the entry.
+        To do this, find the containing w:p element. If it is a heading,
+        return the text.
+        Otherwise, go up the document level by level, staring with the
+        parent of the w:p element containing the entry.
+        At each level, get the list of heading w:p elements which are 
+        children of the top node. We also have the index in the top node
+        of the child node containing the entry.
+        Find the largest index of a heading child which is < the entry
+        index, if any - that is the heading we want.
+        Perhaps we should precalculate some of this.
+        We could also consider doing some of this in xpath, but the style
+        attributes have been modified, so we can't just look for the
+        original names.
+        """
+        pnode = ancestor(node, 'w:p')
+        if self.isHeading(pnode):
+            return self.textValue(pnode)
+
+        while True:
+            parent = pnode.getparent()
+            if parent == None:
+                return 'Link'
+
+            # Maintain document order in these lists
+            pindex = parent.index(pnode)
+            hlist = self.getHeadings(parent)
+            hlist = filter(lambda x: parent.index(x) < pindex, hlist)
+            if len(hlist) > 0:
+                return self.textValue(hlist[-1])
+
+            # Try again
+            pnode = parent
+
+    def bookmarks(self):
+        """
+        For each index entry we need to insert a bookmark at the target location.
+        These bookmarks are for our internal use - I'm not sure they would work well
+        in the original docx document.
+        For each entry we have the Field object, which includes the instrText
+        element of the document.
+        Try going to the parent, and inserting a bookmark start just before it.
+        """
+        bmno = 0
+        for entry in self.entries:
+            for instnode in entry[1].elements:
+                name = 'indexBookmark' + str(bmno)
+                bmno += 1
+                tag = "{%s}bookmarkStart" % namespaces['w']
+                att = "{%s}name" % namespaces['w']
+                bookmark = lxml.etree.Element(tag)
+                bookmark.set(att, name)
+                rnode = instnode.getparent()
+
+                # Add the name so that we can link to it
+                entry.append(name)
+
+                # insert the bookmark before rnode
+                rparent = rnode.getparent()
+                rind = rparent.index(rnode)
+                rparent.insert(rind, bookmark)
+
+                # We want the index entry to be the content of the closest
+                # preceding Heading paragraph.
+                # We should make the targets configurable, and add chapter
+                # titles and maybe other things.
+                # What about numbering?
+                targnode = self.findTarget(rnode)
+                entry.append(targnode)
+
+    def genStyles(self):
+        """
+        Generate css styles for the index elements.
+        We do title, section header, and three levels of entries.
+        These are reasonable styles which only set a couple of key
+        values, but we could provide an interface to allow the user to set them.
+        Is there any problem registering the styles this early in the 
+        conversion process?
+        """
+        # The result is a string we can use as a class name.
+        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
+        self.titleStyle = self.convert.styles.register(css, 'block')
+
+        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
+        self.sectionStyle = self.convert.styles.register(css, 'block')
+
+        self.entryStyles = []
+        for i in range(3):
+            indent = str(i*20) + 'pt'
+            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
+            self.entryStyles.append(self.convert.styles.register(css, 'block'))
+
+    def findSection(self, tag):
+        """
+        Find the section for this index entry, creating it if required.
+        The tag has a form like A or A:B or etc.
+        If you want a single index without section divisions, you can
+        just return the single section here every time.
+        """
+        shead = tag[0]
+
+        # Make it lower case, and group all non-alphabetic things together
+        if shead.isalpha():
+            shead = shead.lower()
+        else:
+            shead = ''
+
+        if shead in self.sections:
+            return self.sections[shead]
+        sect = Section(self)
+        self.sections[shead] = sect
+        return sect
+
+    def generate(self):
+        """
+        We generated the index object in the constructor.
+        This method writes it into the html.
+        """
+        body = self.convert.body
+        body.append(Index.addName('Index', self.titleStyle))
+
+        # And write them to the html
+        for key in sorted(self.sections.keys()):
+            self.sections[key].toHtml(key, body, self.convert.anchor_map)
+
+    @staticmethod
+    def addName(str, clname):
+        # Put this into the convert document map?
+        dest = P()
+        dest.set('class', clname)
+        span = SPAN()
+        from calibre.ebooks.docx.to_html import Text
+        text = Text(span, 'text', [])
+        text.buf.append(str)
+        setattr(text.elem, text.attr, ''.join(text.buf))
+        dest.append(span)
+        return dest
+    
+    @staticmethod
+    def findEntry(value, dict, index):
+        """
+        Find the Entry in the dictionary, or create a new one.
+        We convert to lower case to group all capitalizations
+        together as a single entry.
+        """
+        lvalue = value.lower()
+        if lvalue in dict:
+            return dict[lvalue]
+        ent = Entry(value, index)
+        dict[lvalue] = ent
+        return ent
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
 from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.docx.settings import Settings
+from calibre.ebooks.docx.index import Index
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

@ -44,12 +45,13 @@ class Text:

 class Convert(object):

-    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
+        self.do_index = do_index
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
@ -97,6 +99,14 @@ class Convert(object):
        paras = []

        self.log.debug('Converting Word markup to HTML')
+
+        # If we are doing an index, do the body part of the processing here.
+        # We need to insert bookmarks at the indexed locations before the
+        # main conversion work.
+        if self.do_index:
+            self.log.debug('Generating index')
+            index    = Index(self)
+
        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
@ -105,6 +115,7 @@ class Convert(object):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
+
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
@ -157,6 +168,10 @@ class Convert(object):
                    parent.text = tabs[-1].tail or ''
                    map(parent.remove, tabs)

+        # For an index, we now want to append the index object
+        if self.do_index:
+            index.generate()
+
        self.images.rid_map = orig_rid_map

        self.resolve_links()
--- a/src/calibre/gui2/convert/docx_input.py
+++ b/src/calibre/gui2/convert/docx_input.py
@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):

    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent,
-            ['docx_no_cover', ])
+            ['docx_no_cover', 'docx_index', ])
        self.initialize_options(get_option, get_help, db, book_id)

--- a/src/calibre/gui2/convert/docx_input.ui
+++ b/src/calibre/gui2/convert/docx_input.ui
@ -21,6 +21,13 @@
     </property>
    </widget>
   </item>
+   <item>
+    <widget class="QCheckBox" name="opt_docx_index">
+     <property name="text">
+      <string>Generate an alphabetical index from embedded index markers</string>
+     </property>
+    </widget>
+   </item>
   <item>
    <spacer name="verticalSpacer">
     <property name="orientation">