This material adds the option to generate an alphabetical index from index markers in a docx file.

2025-08-30 23:00:21 -04:00 · 2014-03-25 15:26:20 -07:00 · 2014-03-25 15:26:20 -07:00 · f790038819
commit f790038819
parent 290462909f
7 changed files with 461 additions and 4 deletions
--- a/src/calibre/ebooks/conversion/plugins/docx_input.py
+++ b/src/calibre/ebooks/conversion/plugins/docx_input.py
@ -19,6 +19,9 @@ class DOCXInput(InputFormatPlugin):
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created ebook. This option '
                   'turns off that behavior.')),
        OptionRecommendation(name='docx_index', recommended_value=False,
            help=_('If there are embedded index markers in the document, this option will use them to create '
                   'an alphabetical index with links to the locations of the markers.')),
    }
@ -26,5 +29,5 @@ class DOCXInput(InputFormatPlugin):
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.docx.to_html import Convert
-        return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()
+        return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)()
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@ -22,7 +22,8 @@ from calibre.utils.zipfile import ZipFile
 from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
 def fromstring(raw, parser=RECOVER_PARSER):
-    return etree.fromstring(raw, parser=parser)
+    res = etree.fromstring(raw, parser=parser)
    return res
 # Read metadata {{{
 def read_doc_props(raw, mi):
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@ -10,12 +10,15 @@ import re
 from calibre.ebooks.docx.names import XPath, get
 import sys
 class Field(object):
    def __init__(self, start):
        self.start = start
        self.end = None
        self.contents = []
        self.elements = []
        self.instructions = []
    def add_instr(self, elem):
@ -24,6 +27,7 @@ class Field(object):
            return
        name, rest = raw.strip().partition(' ')[0::2]
        self.instructions.append((name, rest.strip()))
        self.elements.append(elem)
 WORD, FLAG = 0, 1
 scanner = re.Scanner([
--- a/src/calibre/ebooks/docx/index.py
+++ b/src/calibre/ebooks/docx/index.py
@ -0,0 +1,427 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 import itertools
 from collections import OrderedDict
 from lxml import html
 from lxml.html.builder import (
    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
 from calibre.ebooks.docx.names import (
    XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
    ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
 import lxml.etree
 NBSP = '\xa0'
 class Location:
    r"""
    One location in the index, i.e. one place in the text that an index
    entry points to.

    An index entry (see Entry) holds a list of these, one for every place
    where the entry is referenced in the text.

    We should provide a way to mark the main entries. Libre office
    has a main attribute, which doesn't seem to map to docx, and at least
    some versions of word can mark entries bold or italic with \b and \i.

    Note: this docstring is a raw string so that the \b and \i switch names
    above are kept literally instead of being read as escape sequences.
    """

    def __init__(self, bookmark, target):
        # bookmark: name of the bookmark inserted at the indexed location
        # target: text displayed for the link that points at the bookmark
        self.bookmark = bookmark
        self.target = target
 class Entry:
    """
    One index entry.

    An entry has a display name, a list of locations it points to and a
    dictionary of subentries (for the primary/secondary topic situation).
    The list of locations may be empty if the entry only exists to
    organize its subentries.
    """

    def __init__(self, name, index):
        # Subentries keyed by lower-cased name (see Index.findEntry)
        self.subentries = {}
        # Location objects, in the order the markers were encountered
        self.locations = []
        self.name = name
        # The owning Index; used for the registered css class names
        self.index = index

    def addEntry(self, entry, sub):
        """
        Record one index marker under this entry.

        The entry has the form [xxx, field, bookmark, target].
        sub is the list of remaining subtopic names; when it is empty the
        location belongs to this entry, otherwise it is passed down to the
        matching subentry (created on demand).
        """
        if not sub:
            self.locations.append(Location(entry[2], entry[3]))
            return
        sube = Index.findEntry(sub[0], self.subentries, self.index)
        sube.addEntry(entry, sub[1:])

    def makeLink(self, loc, amap):
        """
        Build the <a> element linking to one location.

        amap maps bookmark names to anchor ids. Returns None when the
        bookmark has no anchor (missing from amap or mapped to None), so
        callers must be prepared to skip the location.
        """
        # .get() rather than [] so that an unknown bookmark is treated the
        # same way as one explicitly mapped to None instead of raising.
        markid = amap.get(loc.bookmark)
        if markid is None:
            return None
        # Imported here because to_html itself imports this module
        from calibre.ebooks.docx.to_html import Text
        # As a first pass, we just put a placeholder in the target location
        # We want it to float right
        link = A()
        link.set('style', 'float:right')
        link.set('href', '#' + markid)
        text = Text(link, 'text', [])
        text.buf.append(loc.target)
        setattr(text.elem, text.attr, ''.join(text.buf))
        return link

    def toHtmlUnit(self, body, level, amap):
        """
        Append the material for one index entry to the document.

        There is a name, and 0 or more locations.
        Put the first location, if any, on the same line as the
        name, and others on following lines. Locations whose bookmark
        cannot be resolved to an anchor are left out.
        """
        style = self.index.entryStyles[level]
        main = Index.addName(self.name, style)

        # Only keep the links that could actually be created; appending
        # None to an element would fail.
        links = []
        for loc in self.locations:
            link = self.makeLink(loc, amap)
            if link is not None:
                links.append(link)

        # First link on same line as name
        if links:
            main.append(links[0])
        body.append(main)

        # Put other links for same entry on their own lines
        # To keep the link span separate need to put a space as the name
        for link in links[1:]:
            dest = P()
            dest.set('class', style)
            dest.text = NBSP
            dest.append(link)
            body.append(dest)

    def toHtml(self, body, level, amap):
        """
        Append this entry, followed by its subentries in sorted key order,
        to body. Only three entry styles exist, so the level is capped at 2.
        """
        level = min(level, 2)
        self.toHtmlUnit(body, level, amap)
        for key in sorted(self.subentries):
            self.subentries[key].toHtml(body, level + 1, amap)
 class Section:
    """
    One section of the index - typically all the entries filed under a
    single letter, such as the A's or the B's.

    Essentially a dictionary of top level entries, keyed the way
    Index.findEntry keys them.
    """

    def __init__(self, index):
        self.entries = {}
        self.index = index

    def addEntry(self, entry):
        """
        File the information from one index marker in this section.

        The entry has form [name, field, bookmark, target], where the name
        looks like A or A:B and so on. The top level topic is looked up
        (or created), and the remaining topics are handed to it so that
        the location ends up on the right (sub)entry.
        """
        path = entry[0].strip('"').split(':')
        head, rest = path[0], path[1:]
        top = Index.findEntry(head, self.entries, self.index)
        top.addEntry(entry, rest)

    def toHtml(self, key, body, amap):
        """
        Write this section to the html: a heading (unless the key is
        empty) followed by every entry in sorted key order.
        """
        if len(key) != 0:
            heading = Index.addName(key, self.index.sectionStyle)
            body.append(heading)
        ordered = list(self.entries.keys())
        ordered.sort()
        for name in ordered:
            self.entries[name].toHtml(body, 0, amap)
 class Index:
    """
    This class generates an alphabetical index from the index markers in a docx file.

    Each field in the parse of the docx file contains an instructions list.
    Instructions with name XE are index instructions.
    The instruction also contains the entry specifier, of the form A[:B[:C]] for
    main entry, A, subentry B, and so on.

    The index object is a dictionary of sections, 'a' mapping to a section
    object with all the A entries, and so on. Each section in turn is a dictionary
    of top level entries, each of which holds its locations and subentries.

    We could make the formatting more configurable.
    Currently it uses fixed styles for the various elements, and a section
    heading for each letter.
    """

    def __init__(self, convert):
        """
        Convert the index markers in the document into an index object.

        convert is the running conversion object; its fields, styles,
        body and anchor_map attributes are used by this class.
        """
        self.convert = convert
        self.sections = {}
        self.genStyles()

        # Get a list of [name, field] entries, where name is the index
        # entry and field is the indexed location
        self.entries = self.getEntries()

        # Find the styles which provide the text for links.
        self.targetStyles()

        # Generate bookmarks in the document at the indexed locations
        self.bookmarks()

        # Set up the entries in index sections
        for unit in self.entries:
            sec = self.findSection(unit[0])
            sec.addEntry(unit)

    def getEntries(self):
        r"""
        Return a list of [name, field] lists, one per index marker.

        We already have a list of fields which includes the index marks,
        identified by an XE tag.
        In the base case, the field object includes an instruction list
        with one tuple like ('XE', '"entry"'), where entry is the text we
        want to put in the index. Note the double quotes around the entry.

        Sometimes the entry is broken up in the document, for example if
        there are spelling issues in the entry text.
        In this case, for reasons I don't understand, the instruction
        list includes a number of tuples, and we get the actual entry
        text by concatenating all of them after the initial tag.

        There can be formatting information in the instructions also, after
        the double quoted part, like '"entry" \b'.
        So, we want to concatenate all parts after the initial tag, and
        then get the part in double quotes.
        """
        # A real list is required: the result is iterated more than once
        # and its items are extended in place by bookmarks().
        return [
            [self.getEntry(f), f]
            for f in self.convert.fields.fields
            if f.instructions and f.instructions[0][0] == 'XE'
        ]

    def getEntry(self, field):
        """
        Return the index entry text of one XE field: the first double
        quoted part of the concatenated instructions, or the whole
        concatenation if nothing follows an opening quote.
        """
        parts = [field.instructions[0][1]]
        for inst in field.instructions[1:]:
            parts.append(inst[0])
            parts.append(inst[1])
        entry = ''.join(parts)
        after_quote = entry.partition('"')[2]
        if after_quote == '':
            return entry
        return after_quote.partition('"')[0]

    def targetStyles(self):
        """
        We want to get a list of styles which represent valid index targets.
        That is, the text of a link in the index will be the title of the
        section of the document containing the indexed location.
        We want the list of styles which can provide a valid title.
        In practice, this maps to Heading1 through Heading3 in the original document.

        Calibre apparently preprocesses docx files, so that a paragraph in
        the original with style Heading1 will now have a different, internal style.
        In this version we use convert.styles.id_map to find style ids
        with internal names beginning Heading; but I'd feel better if we
        jumped in earlier and could map it to the original docx styles.
        """
        smap = self.convert.styles.id_map
        items = smap.iteritems() if hasattr(smap, 'iteritems') else smap.items()
        # A style without a name can never be a heading; don't crash on it
        self.targstyles = [
            sid for sid, style in items
            if (style.name or '').startswith('Heading')
        ]

    def isHeading(self, node):
        """
        Return true if the input node is a valid index link target, that
        is, if its paragraph style is one of the target styles.
        """
        snodes = XPath("./w:pPr/w:pStyle")(node)
        if not snodes:
            return False
        sn = snodes[0]
        # The key includes the long namespace information
        for key in sn.keys():
            if key.endswith('}val'):
                return sn.get(key) in self.targstyles
        return False

    def getHeadings(self, node):
        """
        Get a list of all children of the input node which are headings -
        that is, valid targets for an index link. Document order is kept.
        """
        return [c for c in node if self.isHeading(c)]

    def textValue(self, node):
        """
        Return the concatenated text of the runs directly inside node, or
        the placeholder 'Link' if it has no text elements at all.
        """
        tnodes = XPath("./w:r/w:t")(node)
        if not tnodes:
            return 'Link'
        # An empty w:t element has text None, which join() would reject
        return ''.join(t.text or '' for t in tnodes)

    def findTarget(self, node):
        """
        Given an index entry, find the text of the last heading section
        preceding the entry.

        To do this, find the containing w:p element. If it is a heading,
        return the text.

        Otherwise, go up the document level by level, starting with the
        w:p element containing the entry. At each level, look at the
        preceding siblings of the current node, nearest first; the first
        heading found is the closest preceding heading at that level and
        is the one we want.

        Returns the placeholder 'Link' if there is no such heading.

        Perhaps we should precalculate some of this.
        We could also consider doing some of this in xpath, but the style
        attributes have been modified, so we can't just look for the
        original names.
        """
        pnode = ancestor(node, 'w:p')
        if pnode is None:
            # NOTE(review): assumes ancestor() yields None when the marker is
            # not inside a paragraph - confirm against calibre.ebooks.docx.names
            return 'Link'
        if self.isHeading(pnode):
            return self.textValue(pnode)

        while True:
            parent = pnode.getparent()
            if parent is None:
                return 'Link'
            # Nearest preceding sibling first, so the first hit is the
            # heading with the largest index below that of pnode.
            for sib in pnode.itersiblings(preceding=True):
                if self.isHeading(sib):
                    return self.textValue(sib)
            # Try again one level up
            pnode = parent

    def bookmarks(self):
        """
        For each index entry we need to insert a bookmark at the target location.
        These bookmarks are for our internal use - I'm not sure they would work well
        in the original docx document.

        For each entry we have the Field object, which includes the instrText
        element of the document. We go to its parent and insert a bookmark
        start just before it.

        Each entry is extended in place from [name, field] to
        [name, field, bookmark, target]. When the instruction is spread
        over several elements only the first one is bookmarked, since a
        single bookmark/target pair per entry is all that is ever used.
        """
        tag = "{%s}bookmarkStart" % namespaces['w']
        att = "{%s}name" % namespaces['w']
        bmno = 0
        for entry in self.entries:
            for instnode in entry[1].elements[:1]:
                name = 'indexBookmark' + str(bmno)
                bmno += 1
                bookmark = lxml.etree.Element(tag)
                bookmark.set(att, name)
                rnode = instnode.getparent()

                # Add the name so that we can link to it
                entry.append(name)

                # insert the bookmark before rnode
                rparent = rnode.getparent()
                rparent.insert(rparent.index(rnode), bookmark)

                # We want the index entry to be the content of the closest
                # preceding Heading paragraph.
                # We should make the targets configurable, and add chapter
                # titles and maybe other things.
                # What about numbering?
                entry.append(self.findTarget(rnode))

    def genStyles(self):
        """
        Generate css styles for the index elements.
        We do title, section header, and three levels of entries.
        These are reasonable styles which only set a couple of key
        values, but we could provide an interface to allow the user to set them.

        Is there any problem registering the styles this early in the
        conversion process?
        """
        register = self.convert.styles.register

        # The result of register is a string we can use as a class name.
        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
        self.titleStyle = register(css, 'block')

        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
        self.sectionStyle = register(css, 'block')

        self.entryStyles = []
        for i in range(3):
            indent = str(i*20) + 'pt'
            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
            self.entryStyles.append(register(css, 'block'))

    def findSection(self, tag):
        """
        Find the section for this index entry, creating it if required.
        The tag has a form like A or A:B or etc.

        Sections are keyed by the lower-cased first character of the tag;
        everything that does not start with a letter (including an empty
        tag) is grouped together under the key ''.

        If you want a single index without section divisions, you can
        just return the single section here every time.
        """
        # Slicing instead of indexing so an empty tag does not raise
        shead = tag[:1]
        shead = shead.lower() if shead.isalpha() else ''

        sect = self.sections.get(shead)
        if sect is None:
            sect = self.sections[shead] = Section(self)
        return sect

    def generate(self):
        """
        We generated the index object in the constructor.
        This method writes it into the html, appending a title and then
        every section in sorted key order to the body of the document.
        """
        body = self.convert.body
        body.append(Index.addName('Index', self.titleStyle))

        # And write them to the html
        for key in sorted(self.sections):
            self.sections[key].toHtml(key, body, self.convert.anchor_map)

    @staticmethod
    def addName(str, clname):
        """
        Return a <p> with css class clname containing a <span> whose text
        is the given string.

        The parameter name shadows the builtin; it is kept for interface
        compatibility, so the builtin must not be used in this function.
        """
        # Put this into the convert document map?
        # Imported here because to_html itself imports this module
        from calibre.ebooks.docx.to_html import Text
        dest = P()
        dest.set('class', clname)
        span = SPAN()
        text = Text(span, 'text', [])
        text.buf.append(str)
        setattr(text.elem, text.attr, ''.join(text.buf))
        dest.append(span)
        return dest

    @staticmethod
    def findEntry(value, dict, index):
        """
        Find the Entry in the dictionary, or create a new one.
        We convert to lower case to group all capitalizations
        together as a single entry; the entry keeps the capitalization
        of the first occurrence as its name.

        The parameter name shadows the builtin; it is kept for interface
        compatibility, so the builtin must not be used in this function.
        """
        lvalue = value.lower()
        if lvalue in dict:
            return dict[lvalue]
        ent = Entry(value, index)
        dict[lvalue] = ent
        return ent
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
 from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.docx.settings import Settings
 from calibre.ebooks.docx.index import Index
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -44,12 +45,13 @@ class Text:
 class Convert(object):
-    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
+    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.do_index = do_index
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
@ -97,6 +99,14 @@ class Convert(object):
        paras = []
        self.log.debug('Converting Word markup to HTML')
        # If we are doing an index, do the body part of the processing here.
        # We need to insert bookmarks at the indexed locations before the
        # main conversion work.
        if self.do_index:
            self.log.debug('Generating index')
            index    = Index(self)
        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
@ -105,6 +115,7 @@ class Convert(object):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
@ -157,6 +168,10 @@ class Convert(object):
                    parent.text = tabs[-1].tail or ''
                    map(parent.remove, tabs)
        # For an index, we now want to append the index object
        if self.do_index:
            index.generate()
        self.images.rid_map = orig_rid_map
        self.resolve_links()
--- a/src/calibre/gui2/convert/docx_input.py
+++ b/src/calibre/gui2/convert/docx_input.py
@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):
    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
        Widget.__init__(self, parent,
-            ['docx_no_cover', ])
+            ['docx_no_cover', 'docx_index', ])
        self.initialize_options(get_option, get_help, db, book_id)
--- a/src/calibre/gui2/convert/docx_input.ui
+++ b/src/calibre/gui2/convert/docx_input.ui
@ -21,6 +21,13 @@
     </property>
    </widget>
   </item>
   <item>
    <widget class="QCheckBox" name="opt_docx_index">
     <property name="text">
      <string>Generate an alphabetical index from embedded index markers</string>
     </property>
    </widget>
   </item>
   <item>
    <spacer name="verticalSpacer">
     <property name="orientation">