From f790038819c45614eb0ebff141600da28f63d28d Mon Sep 17 00:00:00 2001 From: Peter Garst Date: Tue, 25 Mar 2014 15:26:20 -0700 Subject: [PATCH] This material adds the option to generate an alphabetical index from index markers in a docx file. --- .../ebooks/conversion/plugins/docx_input.py | 5 +- src/calibre/ebooks/docx/container.py | 3 +- src/calibre/ebooks/docx/fields.py | 4 + src/calibre/ebooks/docx/index.py | 427 ++++++++++++++++++ src/calibre/ebooks/docx/to_html.py | 17 +- src/calibre/gui2/convert/docx_input.py | 2 +- src/calibre/gui2/convert/docx_input.ui | 7 + 7 files changed, 461 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/docx/index.py diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py index 30a4bb3868..fe987ea28f 100644 --- a/src/calibre/ebooks/conversion/plugins/docx_input.py +++ b/src/calibre/ebooks/conversion/plugins/docx_input.py @@ -19,6 +19,9 @@ class DOCXInput(InputFormatPlugin): help=_('Normally, if a large image is present at the start of the document that looks like a cover, ' 'it will be removed from the document and used as the cover for created ebook. 
This option ' 'turns off that behavior.')), + OptionRecommendation(name='docx_index', recommended_value=False, + help=_('If there are embedded index markers in the document, this option will use them to create ' + 'an alphabetical index with links to the locations of the markers.')), } @@ -26,5 +29,5 @@ class DOCXInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.docx.to_html import Convert - return Convert(stream, detect_cover=not options.docx_no_cover, log=log)() + return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)() diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index deaf5bd4d0..82b3fc9f4b 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -22,7 +22,8 @@ from calibre.utils.zipfile import ZipFile from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER def fromstring(raw, parser=RECOVER_PARSER): - return etree.fromstring(raw, parser=parser) + res = etree.fromstring(raw, parser=parser) + return res # Read metadata {{{ def read_doc_props(raw, mi): diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 6617728f0c..189927a6c6 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -10,12 +10,15 @@ import re from calibre.ebooks.docx.names import XPath, get +import sys + class Field(object): def __init__(self, start): self.start = start self.end = None self.contents = [] + self.elements = [] self.instructions = [] def add_instr(self, elem): @@ -24,6 +27,7 @@ class Field(object): return name, rest = raw.strip().partition(' ')[0::2] self.instructions.append((name, rest.strip())) + self.elements.append(elem) WORD, FLAG = 0, 1 scanner = re.Scanner([ diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py new file mode 100644 index 0000000000..23865210a0 --- /dev/null +++ 
b/src/calibre/ebooks/docx/index.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal '
+
+import itertools
+from collections import OrderedDict
+from lxml import html
+from lxml.html.builder import (
+    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
+from calibre.ebooks.docx.names import (
+    XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
+    ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
+
+import lxml.etree
+
+NBSP = '\xa0'
+
+class Location:
+    """
+    This class represents one location in the index.
+    We should provide a way to mark the main entries. Libre office
+    has a main attribute, which doesn't seem to map to docx, and at least
+    some versions of word can mark entries bold or italic with \\b and \\i.
+    One index entry corresponds to a list of locations where the entry
+    is referenced in the text.
+    """
+
+    def __init__(self, bookmark, target):
+        self.bookmark = bookmark
+        self.target = target
+
+class Entry:
+    """
+    This class represents one index entry.
+    We can also have a list of subentries for the primary/secondary
+    topic situation.
+    Each entry has a list of locations we want to point to, but
+    it could be empty if this is only here to organize subentries.
+    """
+
+    def __init__(self, name, index):
+        self.subentries = {}
+        self.locations = []
+        self.name = name
+        self.index = index
+
+    def addEntry(self, entry, sub):
+        """
+        The entry has the form [xxx, field, bookmark, target]
+        """
+        if len(sub) == 0:
+            self.locations.append(Location(entry[2], entry[3]))
+        else:
+            sube = Index.findEntry(sub[0], self.subentries, self.index)
+            sube.addEntry(entry, sub[1:])
+
+    def makeLink(self, loc, amap):
+        # Build the right-floated link for one location. amap is expected to
+        # be a dict mapping bookmark names to the anchor ids used in the html.
+        from calibre.ebooks.docx.to_html import Text
+        markid = amap.get(loc.bookmark)
+
+        span = A()
+        span.set('style', 'float:right')
+        if markid is not None:
+            # Without an anchor the location text is emitted unlinked
+            span.set('href', '#' + markid)
+        text = Text(span, 'text', [])
+        text.buf.append(loc.target)
+        setattr(text.elem, text.attr, ''.join(text.buf))
+        return span
+
+    def toHtmlUnit(self, body, level, amap):
+        """
+        Append the material for one index entry to the document.
+        There is a name, and 0 or more locations.
+        Put the first location, if any, on the same line as the
+        name, and others on following lines.
+        """
+        style = self.index.entryStyles[level]
+        main = Index.addName(self.name, style)
+        if len(self.locations) == 0:
+            body.append(main)
+            return
+
+        # First link on same line as name
+        link = self.makeLink(self.locations[0], amap)
+        main.append(link)
+        body.append(main)
+
+        # Put other links for same entry on their own lines
+        # To keep the link span separate need to put a space as the name
+        for l in self.locations[1:]:
+            link = self.makeLink(l, amap)
+            dest = P()
+            dest.set('class', style)
+            dest.text = NBSP
+            dest.append(link)
+            body.append(dest)
+
+    def toHtml(self, body, level, amap):
+        level = min(level, 2)
+        self.toHtmlUnit(body, level, amap)
+        for key in sorted(self.subentries.keys()):
+            self.subentries[key].toHtml(body, level + 1, amap)
+
+class Section:
+    """
+    This class represents one section of the index - usually,
+    for example, the A's or the B's.
+    It is primarily a dictionary of entries.
+    """
+
+    def __init__(self, index):
+        self.index = index
+        self.entries = {}
+
+    def addEntry(self, entry):
+        """
+        We have information from one index marker.
+        The entry has form [name, field, bookmark, target].
+        The name is something like A or A:B and so on.
+        If we already have an entry for that name, just add the new
+        location to it; otherwise create a new entry.
+        """
+        topics = entry[0].strip('"').split(':')
+        targ = Index.findEntry(topics[0], self.entries, self.index)
+        targ.addEntry(entry, topics[1:])
+
+    def toHtml(self, key, body, amap):
+        """
+        Add one section of the index to the html
+        """
+        if len(key) > 0:
+            body.append(Index.addName(key, self.index.sectionStyle))
+        for ekey in sorted(self.entries.keys()):
+            self.entries[ekey].toHtml(body, 0, amap)
+
+class Index:
+    """
+    This class generates an alphabetical index from the index markers in a docx file.
+
+    Each field in the parse of the docx file contains an instructions list.
+    Instructions with name XE are index instructions.
+    The instruction also contains the entry specifier, of the form A[:B[:C]] for
+    main entry, A, subentry B, and so on.
+
+    The index object is a dictionary of sections, 'A' mapping to a section
+    object with all the A entries, and so on. Each section in turn is a dictionary
+    mapping an index specifier, like A:B, to a list of locations where that
+    entry is referenced.
+
+    We could make the formatting more configurable.
+    Currently it uses fixed styles for the various elements, and a section
+    heading for each letter.
+    """
+
+    def __init__(self, convert):
+        """
+        Convert the index markers in the document into an index object.
+        """
+        self.convert = convert
+        self.sections = {}
+
+        self.genStyles()
+
+        # Get a list of [name, field] entries, where name is the index
+        # entry and field is the indexed location
+        self.entries = self.getEntries()
+
+        # Find the styles which provide the text for links.
+        self.targetStyles()
+
+        # Generate bookmarks in the document at the indexed locations
+        self.bookmarks()
+
+        # Set up the entries in index sections
+        for unit in self.entries:
+            sec = self.findSection(unit[0])
+            sec.addEntry(unit)
+
+    def getEntries(self):
+        """
+        We already have a list of fields which includes the index marks,
+        identified by an XE tag.
+        In the base case, the field object includes an instruction list
+        with one tuple like ('XE', '"entry"'), where entry is the text we
+        want to put in the index. Note the double quotes around the entry.
+        Sometimes the entry is broken up in the document, for example if
+        there are spelling issues in the entry text.
+        In this case, for reasons I don't understand, the instruction
+        list includes a number of tuples, and we get the actual entry
+        text by concatenating all of them after the initial tag.
+        There can be formatting information in the instructions also, after
+        the double quoted part, like '"entry" \\b'.
+        So, we want to concatenate all parts after the initial tag, and
+        then get the part in double quotes.
+        """
+        fields = self.convert.fields.fields
+
+        # Only want the index entries
+        fields = [f for f in fields if f.instructions and f.instructions[0][0] == 'XE']
+        return [[self.getEntry(f), f] for f in fields]
+
+    def getEntry(self, field):
+        # Join the instruction pieces and return the text inside the first pair of double quotes
+        elist = [field.instructions[0][1]]
+        for inst in field.instructions[1:]:
+            elist.append(inst[0])
+            elist.append(inst[1])
+
+        entry = ''.join(elist)
+        sep1 = entry.partition('"')
+        if sep1[2] == '':
+            return entry
+        sep2 = sep1[2].partition('"')
+        return sep2[0]
+
+    def targetStyles(self):
+        """
+        We want to get a list of styles which represent valid index targets.
+        That is, the text of a link in the index will be the title of the
+        section of the document containing the indexed location.
+        We want the list of styles which can provide a valid title.
+        In practice, this maps to Heading1 through Heading3 in the original document.
+        Calibre apparently preprocesses docx files, so that a paragraph in
+        the original with style Heading1 will now have a different, internal style.
+        In this version we use convert.styles.id_map to find style ids
+        with internal names beginning Heading; but I'd feel better if we
+        jumped in earlier and could map it to the original docx styles.
+        """
+        smap = self.convert.styles.id_map
+        self.targstyles = [name for name, style in smap.iteritems() if style.name.startswith('Heading')]
+
+    def isHeading(self, node):
+        """
+        Return true if the input node is a valid index link target.
+        """
+        snodes = XPath("./w:pPr/w:pStyle")(node)
+        if len(snodes) == 0:
+            return False
+
+        sn = snodes[0]
+
+        # The key includes the long namespace information
+        k = [key for key in sn.keys() if key.endswith('}val')]
+        if len(k) == 0:
+            return False
+        style = sn.get(k[0])
+        return style in self.targstyles
+
+    def getHeadings(self, node):
+        """
+        Get a list of all children of the input node which are headings -
+        that is, valid targets for an index link
+        """
+        answer = []
+        for c in node.getchildren():
+            if self.isHeading(c):
+                answer.append(c)
+        return answer
+
+    def textValue(self, node):
+        tnodes = XPath("./w:r/w:t")(node)
+        if len(tnodes) == 0:
+            return 'Link'
+        textl = [t.text or '' for t in tnodes]  # an empty w:t has text None
+        return ''.join(textl)
+
+    def findTarget(self, node):
+        """
+        Given an index entry, find the text of the last heading section
+        preceding the entry.
+        To do this, find the containing w:p element. If it is a heading,
+        return the text.
+        Otherwise, go up the document level by level, starting with the
+        parent of the w:p element containing the entry.
+        At each level, get the list of heading w:p elements which are
+        children of the top node. We also have the index in the top node
+        of the child node containing the entry.
+        Find the largest index of a heading child which is < the entry
+        index, if any - that is the heading we want.
+        Perhaps we should precalculate some of this.
+        We could also consider doing some of this in xpath, but the style
+        attributes have been modified, so we can't just look for the
+        original names.
+        """
+        pnode = ancestor(node, 'w:p')
+        if self.isHeading(pnode):
+            return self.textValue(pnode)
+
+        while True:
+            parent = pnode.getparent()
+            if parent is None:
+                return 'Link'
+
+            # Maintain document order in these lists
+            pindex = parent.index(pnode)
+            hlist = self.getHeadings(parent)
+            hlist = [h for h in hlist if parent.index(h) < pindex]
+            if len(hlist) > 0:
+                return self.textValue(hlist[-1])
+
+            # Try again
+            pnode = parent
+
+    def bookmarks(self):
+        """
+        For each index entry we need to insert a bookmark at the target location.
+        These bookmarks are for our internal use - I'm not sure they would work well
+        in the original docx document.
+        For each entry we have the Field object, which includes the instrText
+        element of the document.
+        Try going to the parent, and inserting a bookmark start just before it.
+        """
+        bmno = 0
+        for entry in self.entries:
+            for instnode in entry[1].elements:
+                name = 'indexBookmark' + str(bmno)
+                bmno += 1
+                tag = "{%s}bookmarkStart" % namespaces['w']
+                att = "{%s}name" % namespaces['w']
+                bookmark = lxml.etree.Element(tag)
+                bookmark.set(att, name)
+                rnode = instnode.getparent()
+
+                # Add the name so that we can link to it
+                entry.append(name)
+
+                # insert the bookmark before rnode
+                rparent = rnode.getparent()
+                rind = rparent.index(rnode)
+                rparent.insert(rind, bookmark)
+
+                # We want the index entry to be the content of the closest
+                # preceding Heading paragraph.
+                # We should make the targets configurable, and add chapter
+                # titles and maybe other things.
+                # What about numbering?
+                targnode = self.findTarget(rnode)
+                entry.append(targnode)
+
+    def genStyles(self):
+        """
+        Generate css styles for the index elements.
+        We do title, section header, and three levels of entries.
+        These are reasonable styles which only set a couple of key
+        values, but we could provide an interface to allow the user to set them.
+        Is there any problem registering the styles this early in the
+        conversion process?
+        """
+        # The result is a string we can use as a class name.
+        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
+        self.titleStyle = self.convert.styles.register(css, 'block')
+
+        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
+        self.sectionStyle = self.convert.styles.register(css, 'block')
+
+        self.entryStyles = []
+        for i in range(3):
+            indent = str(i*20) + 'pt'
+            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
+            self.entryStyles.append(self.convert.styles.register(css, 'block'))
+
+    def findSection(self, tag):
+        """
+        Find the section for this index entry, creating it if required.
+        The tag has a form like A or A:B or etc.
+        If you want a single index without section divisions, you can
+        just return the single section here every time.
+        """
+        shead = tag[:1]  # slice, not tag[0]: an empty entry must not raise IndexError
+
+        # Make it lower case, and group all non-alphabetic things together
+        if shead.isalpha():
+            shead = shead.lower()
+        else:
+            shead = ''
+
+        if shead in self.sections:
+            return self.sections[shead]
+        sect = Section(self)
+        self.sections[shead] = sect
+        return sect
+
+    def generate(self):
+        """
+        We generated the index object in the constructor.
+        This method writes it into the html.
+        """
+        body = self.convert.body
+        body.append(Index.addName('Index', self.titleStyle))
+
+        # And write them to the html
+        for key in sorted(self.sections.keys()):
+            self.sections[key].toHtml(key, body, self.convert.anchor_map)
+
+    @staticmethod
+    def addName(str, clname):
+        # Put this into the convert document map?
+        dest = P()
+        dest.set('class', clname)
+        span = SPAN()
+        from calibre.ebooks.docx.to_html import Text
+        text = Text(span, 'text', [])
+        text.buf.append(str)
+        setattr(text.elem, text.attr, ''.join(text.buf))
+        dest.append(span)
+        return dest
+
+    @staticmethod
+    def findEntry(value, dict, index):
+        """
+        Find the Entry in the dictionary, or create a new one.
+        We convert to lower case to group all capitalizations
+        together as a single entry.
+        """
+        lvalue = value.lower()
+        if lvalue in dict:
+            return dict[lvalue]
+        ent = Entry(value, index)
+        dict[lvalue] = ent
+        return ent
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 1b1622d219..3b0be2febf 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme from calibre.ebooks.docx.toc import create_toc from calibre.ebooks.docx.fields import Fields from calibre.ebooks.docx.settings import Settings +from calibre.ebooks.docx.index import Index from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 @@ -44,12 +45,13 @@ class Text: class Convert(object): - def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None): + def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None): self.docx = DOCX(path_or_stream, log=log) self.ms_pat = re.compile(r'\s{2,}') self.ws_pat = re.compile(r'[\n\r\t]') self.log = self.docx.log self.detect_cover = detect_cover + self.do_index = do_index self.notes_text = notes_text or _('Notes') self.dest_dir = dest_dir or os.getcwdu() self.mi = self.docx.metadata @@ -97,6 +99,14 @@ class Convert(object): paras = [] self.log.debug('Converting Word markup to HTML') + + # If we are doing an index, do the body part of the processing here.
+ # We need to insert bookmarks at the indexed locations before the + # main conversion work. + if self.do_index: + self.log.debug('Generating index') + index = Index(self) + self.read_page_properties(doc) self.current_rels = relationships_by_id for wp, page_properties in self.page_map.iteritems(): @@ -105,6 +115,7 @@ class Convert(object): p = self.convert_p(wp) self.body.append(p) paras.append(wp) + self.read_block_anchors(doc) self.styles.apply_contextual_spacing(paras) # Apply page breaks at the start of every section, except the first @@ -157,6 +168,10 @@ class Convert(object): parent.text = tabs[-1].tail or '' map(parent.remove, tabs) + # For an index, we now want to append the index object + if self.do_index: + index.generate() + self.images.rid_map = orig_rid_map self.resolve_links() diff --git a/src/calibre/gui2/convert/docx_input.py b/src/calibre/gui2/convert/docx_input.py index 46234c6a36..0fbfb66634 100644 --- a/src/calibre/gui2/convert/docx_input.py +++ b/src/calibre/gui2/convert/docx_input.py @@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['docx_no_cover', ]) + ['docx_no_cover', 'docx_index', ]) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/docx_input.ui b/src/calibre/gui2/convert/docx_input.ui index 41948118dc..8f3843dd2a 100644 --- a/src/calibre/gui2/convert/docx_input.ui +++ b/src/calibre/gui2/convert/docx_input.ui @@ -21,6 +21,13 @@ + + + + Generate an alphabetical index from embedded index markers + + +