This material adds the option to generate an alphabetical index from index markers in a docx file.

This commit is contained in:
Peter Garst 2014-03-25 15:26:20 -07:00
parent 290462909f
commit f790038819
7 changed files with 461 additions and 4 deletions

View File

@ -19,6 +19,9 @@ class DOCXInput(InputFormatPlugin):
help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
'it will be removed from the document and used as the cover for created ebook. This option '
'turns off that behavior.')),
OptionRecommendation(name='docx_index', recommended_value=False,
help=_('If there are embedded index markers in the document, this option will use them to create '
'an alphabetical index with links to the locations of the markers.')),
}
@ -26,5 +29,5 @@ class DOCXInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.docx.to_html import Convert
return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()
return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)()

View File

@ -22,7 +22,8 @@ from calibre.utils.zipfile import ZipFile
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
def fromstring(raw, parser=RECOVER_PARSER):
return etree.fromstring(raw, parser=parser)
res = etree.fromstring(raw, parser=parser)
return res
# Read metadata {{{
def read_doc_props(raw, mi):

View File

@ -10,12 +10,15 @@ import re
from calibre.ebooks.docx.names import XPath, get
import sys
class Field(object):
def __init__(self, start):
self.start = start
self.end = None
self.contents = []
self.elements = []
self.instructions = []
def add_instr(self, elem):
@ -24,6 +27,7 @@ class Field(object):
return
name, rest = raw.strip().partition(' ')[0::2]
self.instructions.append((name, rest.strip()))
self.elements.append(elem)
WORD, FLAG = 0, 1
scanner = re.Scanner([

View File

@ -0,0 +1,427 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import itertools
from collections import OrderedDict
from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
import lxml.etree
NBSP = '\xa0'
class Location(object):
    r"""
    One location in the index.

    An index entry corresponds to a list of locations where the entry is
    referenced in the text; each of them is represented by one instance
    of this class.

    We should provide a way to mark the main entries. Libre office
    has a main attribute, which doesn't seem to map to docx, and at least
    some versions of word can mark entries bold or italic with \b and \i.

    Attributes:
        bookmark: name of the bookmark inserted at the indexed location.
        target: text displayed for the link that points to the bookmark.
    """

    def __init__(self, bookmark, target):
        self.bookmark = bookmark
        self.target = target

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.bookmark, self.target)
class Entry(object):
    """
    One index entry.

    An entry can also hold subentries, for the primary/secondary topic
    situation. Each entry has a list of locations we want to point to,
    but the list may be empty if the entry only exists to organize its
    subentries.

    Attributes:
        name: the entry text as it is displayed.
        index: the owning Index object (provides the css class names).
        subentries: maps a lower-cased subentry name to its Entry.
        locations: list of Location objects for this entry.
    """

    def __init__(self, name, index):
        self.subentries = {}
        self.locations = []
        self.name = name
        self.index = index

    def addEntry(self, entry, sub):
        """
        Record one index marker under this entry.

        The entry has the form [xxx, field, bookmark, target]. ``sub`` is
        the list of remaining subtopic names; when it is empty the marker
        belongs to this entry itself, otherwise it is passed down to the
        matching subentry, which is created on demand.
        """
        if not sub:
            self.locations.append(Location(entry[2], entry[3]))
            return
        child = Index.findEntry(sub[0], self.subentries, self.index)
        child.addEntry(entry, sub[1:])

    def makeLink(self, loc, amap):
        """
        Build the link element for one location.

        ``amap`` maps bookmark names to anchor ids. Returns the link
        element, or None when the bookmark has no usable anchor (it is
        missing from the map, or mapped to None).
        """
        try:
            markid = amap[loc.bookmark]
        except KeyError:
            # An unknown bookmark is treated like an unresolved one
            # instead of aborting the whole conversion.
            markid = None
        if markid is None:
            return None

        # As a first pass, we just put a placeholder in the target location.
        # We want it to float right.
        link = A()
        link.set('style', 'float:right')
        link.set('href', '#' + markid)
        # Imported here because to_html itself imports this module.
        from calibre.ebooks.docx.to_html import Text
        text = Text(link, 'text', [])
        text.buf.append(loc.target)
        setattr(text.elem, text.attr, ''.join(text.buf))
        return link

    def toHtmlUnit(self, body, level, amap):
        """
        Append the material for one index entry to the document.

        There is a name, and 0 or more locations. The first resolvable
        link, if any, goes on the same line as the name, the others on
        following lines. Locations whose link cannot be built are skipped.
        """
        style = self.index.entryStyles[level]
        main = Index.addName(self.name, style)

        links = []
        for loc in self.locations:
            link = self.makeLink(loc, amap)
            # makeLink returns None for unresolved bookmarks; appending
            # None to an element would raise.
            if link is not None:
                links.append(link)

        # First link on same line as name
        if links:
            main.append(links[0])
        body.append(main)

        # Put other links for same entry on their own lines.
        # To keep the link span separate need to put a space as the name.
        for link in links[1:]:
            dest = P()
            dest.set('class', style)
            dest.text = NBSP
            dest.append(link)
            body.append(dest)

    def toHtml(self, body, level, amap):
        """
        Write this entry and, recursively, its subentries in sorted key
        order. Nesting deeper than the available entry styles reuses the
        deepest style.
        """
        level = min(level, len(self.index.entryStyles) - 1)
        self.toHtmlUnit(body, level, amap)
        for key in sorted(self.subentries):
            self.subentries[key].toHtml(body, level + 1, amap)
class Section:
    """
    One section of the index - usually, for example, the A's or the B's.

    It is primarily a dictionary of entries, keyed by the lower-cased
    name of the top level topic.
    """

    def __init__(self, index):
        self.entries = {}
        self.index = index

    def addEntry(self, entry):
        """
        Store the information from one index marker.

        The entry has form [name, field, bookmark, target], where the
        name is something like A or A:B and so on. The first topic picks
        (or creates) the top level entry, which then files the location
        under the remaining subtopics.
        """
        parts = entry[0].strip('"').split(':')
        head, rest = parts[0], parts[1:]
        Index.findEntry(head, self.entries, self.index).addEntry(entry, rest)

    def toHtml(self, key, body, amap):
        """
        Add one section of the index to the html: a heading for the
        section key (omitted when the key is empty) followed by all the
        entries in sorted key order.
        """
        if key:
            heading = Index.addName(key, self.index.sectionStyle)
            body.append(heading)
        for ekey in sorted(self.entries):
            self.entries[ekey].toHtml(body, 0, amap)
class Index(object):
    """
    Generate an alphabetical index from the index markers in a docx file.

    Each field in the parse of the docx file contains an instructions list.
    Instructions with name XE are index instructions.
    The instruction also contains the entry specifier, of the form
    A[:B[:C]] for main entry A, subentry B, and so on.

    The index object holds a dictionary of sections, 'a' mapping to a
    Section object with all the A entries, and so on (everything that does
    not start with a letter is grouped under ''). Each section in turn
    maps a lower-cased top level topic to an Entry, which holds its
    locations and its subentries.

    We could make the formatting more configurable.
    Currently it uses fixed styles for the various elements, and a section
    heading for each letter.
    """

    def __init__(self, convert):
        """
        Convert the index markers in the document into an index object.
        """
        self.convert = convert
        self.sections = {}

        self.genStyles()

        # Get a list of [name, field] entries, where name is the index
        # entry and field is the indexed location
        self.entries = self.getEntries()

        # Find the styles which provide the text for links.
        self.targetStyles()

        # Generate bookmarks in the document at the indexed locations
        self.bookmarks()

        # Set up the entries in index sections
        for unit in self.entries:
            self.findSection(unit[0]).addEntry(unit)

    def getEntries(self):
        r"""
        Return a list of [name, field] pairs, one per index marker.

        We already have a list of fields which includes the index marks,
        identified by an XE tag.
        In the base case, the field object includes an instruction list
        with one tuple like ('XE', '"entry"'), where entry is the text we
        want to put in the index. Note the double quotes around the entry.

        Sometimes the entry is broken up in the document, for example if
        there are spelling issues in the entry text.
        In this case the instruction list includes a number of tuples, and
        we get the actual entry text by concatenating all of them after
        the initial tag.

        There can be formatting information in the instructions also, after
        the double quoted part, like '"entry" \b'.

        So, we want to concatenate all parts after the initial tag, and
        then get the part in double quotes.
        """
        # A real list is required: the result is iterated more than once
        # (by bookmarks() and again by __init__).
        return [
            [self.getEntry(f), f]
            for f in self.convert.fields.fields
            # Only want the index entries
            if f.instructions and f.instructions[0][0] == 'XE'
        ]

    def getEntry(self, field):
        """
        Return the entry text of one XE field.

        All instruction pieces after the initial XE tag are concatenated
        and the part between the first pair of double quotes is returned.
        If there is no text after an opening double quote, the whole
        concatenation is returned unchanged.
        """
        parts = [field.instructions[0][1]]
        for inst in field.instructions[1:]:
            # Each instruction was split at its first space into
            # (name, rest); put that space back when both halves exist so
            # words inside one piece are not glued together.
            parts.append(inst[0])
            if inst[1]:
                parts.append(' ')
                parts.append(inst[1])
        entry = ''.join(parts)

        after_quote = entry.partition('"')[2]
        if after_quote == '':
            return entry
        return after_quote.partition('"')[0]

    def targetStyles(self):
        """
        Collect the ids of the styles which represent valid index targets.

        The text of a link in the index will be the title of the section
        of the document containing the indexed location, so we want the
        list of styles which can provide a valid title.
        In practice, this maps to Heading1 through Heading3 in the
        original document.

        In this version we use convert.styles.id_map to find style ids
        whose style name begins with Heading; it would be nicer to map
        this to the original docx styles directly.
        """
        smap = self.convert.styles.id_map
        # NOTE(review): a style may have no name - presumably None in that
        # case; treat it as an empty name. Confirm against the styles code.
        self.targstyles = [
            sid for sid, style in smap.items()
            if (style.name or '').startswith('Heading')
        ]

    def isHeading(self, node):
        """
        Return true if the input node is a valid index link target.
        """
        snodes = XPath("./w:pPr/w:pStyle")(node)
        if not snodes:
            return False

        sn = snodes[0]
        # The key includes the long namespace information
        for key in sn.keys():
            if key.endswith('}val'):
                return sn.get(key) in self.targstyles
        return False

    def getHeadings(self, node):
        """
        Get a list of all children of the input node which are headings -
        that is, valid targets for an index link
        """
        return [child for child in node if self.isHeading(child)]

    def textValue(self, node):
        """
        Return the concatenated text of the w:r/w:t children of the node,
        or 'Link' when the node has no such children.
        """
        tnodes = XPath("./w:r/w:t")(node)
        if not tnodes:
            return 'Link'
        # An empty w:t element has text None, which join() cannot handle.
        return ''.join(t.text or '' for t in tnodes)

    def findTarget(self, node):
        """
        Given an index entry, find the text of the last heading section
        preceding the entry.

        To do this, find the containing w:p element. If it is a heading,
        return the text.

        Otherwise, go up the document level by level, starting with the
        w:p element containing the entry. At each level look backwards
        through the preceding siblings; the nearest one which is a
        heading is the one we want. If nothing is found at that level,
        repeat with the parent.

        Returns 'Link' when no heading precedes the entry.

        Perhaps we should precalculate some of this.
        We could also consider doing some of this in xpath, but the style
        attributes have been modified, so we can't just look for the
        original names.
        """
        pnode = ancestor(node, 'w:p')
        # NOTE(review): presumably ancestor() yields None when there is no
        # enclosing paragraph - confirm; in that case there is no target.
        if pnode is None:
            return 'Link'
        if self.isHeading(pnode):
            return self.textValue(pnode)

        while True:
            parent = pnode.getparent()
            if parent is None:
                return 'Link'

            # Preceding siblings come nearest first, so the first heading
            # found is the last one before the entry in document order.
            for sibling in pnode.itersiblings(preceding=True):
                if self.isHeading(sibling):
                    return self.textValue(sibling)

            # Try again one level up
            pnode = parent

    def bookmarks(self):
        """
        Insert a bookmark at the target location of each index entry.

        These bookmarks are for our internal use - I'm not sure they would
        work well in the original docx document.

        For each entry we have the Field object, which includes the
        instrText elements of the document. For each of them we go to
        the parent, and insert a bookmark start just before it.

        The bookmark name and the link text are appended to the entry, so
        it becomes [name, field, bookmark, target, ...]. Entry.addEntry
        reads only the first bookmark/target pair.
        """
        tag = "{%s}bookmarkStart" % namespaces['w']
        att = "{%s}name" % namespaces['w']

        bmno = 0
        for entry in self.entries:
            for instnode in entry[1].elements:
                name = 'indexBookmark' + str(bmno)
                bmno += 1
                bookmark = lxml.etree.Element(tag)
                bookmark.set(att, name)
                rnode = instnode.getparent()

                # Add the name so that we can link to it
                entry.append(name)

                # insert the bookmark before rnode
                rparent = rnode.getparent()
                rparent.insert(rparent.index(rnode), bookmark)

                # We want the index entry to be the content of the closest
                # preceding Heading paragraph.
                # We should make the targets configurable, and add chapter
                # titles and maybe other things.
                # What about numbering?
                entry.append(self.findTarget(rnode))

    def genStyles(self):
        """
        Generate css styles for the index elements.

        We do title, section header, and three levels of entries.
        These are reasonable styles which only set a couple of key
        values, but we could provide an interface to allow the user to
        set them.

        Is there any problem registering the styles this early in the
        conversion process?
        """
        register = self.convert.styles.register

        # The result of register is a string we can use as a class name.
        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
        self.titleStyle = register(css, 'block')

        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
        self.sectionStyle = register(css, 'block')

        self.entryStyles = []
        for i in range(3):
            indent = str(i*20) + 'pt'
            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
            self.entryStyles.append(register(css, 'block'))

    def findSection(self, tag):
        """
        Find the section for this index entry, creating it if required.
        The tag has a form like A or A:B or etc.

        If you want a single index without section divisions, you can
        just return the single section here every time.
        """
        # Slicing instead of indexing: an empty entry name must not raise.
        shead = tag[:1]

        # Make it lower case, and group all non-alphabetic things together
        shead = shead.lower() if shead.isalpha() else ''

        sect = self.sections.get(shead)
        if sect is None:
            sect = self.sections[shead] = Section(self)
        return sect

    def generate(self):
        """
        We generated the index object in the constructor.
        This method writes it into the html.
        """
        body = self.convert.body
        body.append(Index.addName('Index', self.titleStyle))

        # And write the sections to the html, in sorted key order
        for key in sorted(self.sections):
            self.sections[key].toHtml(key, body, self.convert.anchor_map)

    @staticmethod
    def addName(str, clname):
        """
        Return a paragraph element with css class ``clname`` which
        contains a span holding the text ``str``.

        The parameter name shadows the builtin; it is kept so that
        existing callers are not affected.
        """
        # Put this into the convert document map?
        dest = P()
        dest.set('class', clname)
        span = SPAN()
        # Imported here because to_html itself imports this module.
        from calibre.ebooks.docx.to_html import Text
        text = Text(span, 'text', [])
        text.buf.append(str)
        setattr(text.elem, text.attr, ''.join(text.buf))
        dest.append(span)
        return dest

    @staticmethod
    def findEntry(value, dict, index):
        """
        Find the Entry in the dictionary, or create a new one.

        We convert to lower case to group all capitalizations
        together as a single entry; the first spelling seen is the one
        which is displayed.

        The ``dict`` parameter name shadows the builtin; it is kept so
        that existing callers are not affected.
        """
        lvalue = value.lower()
        if lvalue in dict:
            return dict[lvalue]
        ent = Entry(value, index)
        dict[lvalue] = ent
        return ent

View File

@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
from calibre.ebooks.docx.index import Index
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -44,12 +45,13 @@ class Text:
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None):
self.docx = DOCX(path_or_stream, log=log)
self.ms_pat = re.compile(r'\s{2,}')
self.ws_pat = re.compile(r'[\n\r\t]')
self.log = self.docx.log
self.detect_cover = detect_cover
self.do_index = do_index
self.notes_text = notes_text or _('Notes')
self.dest_dir = dest_dir or os.getcwdu()
self.mi = self.docx.metadata
@ -97,6 +99,14 @@ class Convert(object):
paras = []
self.log.debug('Converting Word markup to HTML')
# If we are doing an index, do the body part of the processing here.
# We need to insert bookmarks at the indexed locations before the
# main conversion work.
if self.do_index:
self.log.debug('Generating index')
index = Index(self)
self.read_page_properties(doc)
self.current_rels = relationships_by_id
for wp, page_properties in self.page_map.iteritems():
@ -105,6 +115,7 @@ class Convert(object):
p = self.convert_p(wp)
self.body.append(p)
paras.append(wp)
self.read_block_anchors(doc)
self.styles.apply_contextual_spacing(paras)
# Apply page breaks at the start of every section, except the first
@ -157,6 +168,10 @@ class Convert(object):
parent.text = tabs[-1].tail or ''
map(parent.remove, tabs)
# For an index, we now want to append the index object
if self.do_index:
index.generate()
self.images.rid_map = orig_rid_map
self.resolve_links()

View File

@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['docx_no_cover', ])
['docx_no_cover', 'docx_index', ])
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -21,6 +21,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_docx_index">
<property name="text">
<string>Generate an alphabetical index from embedded index markers</string>
</property>
</widget>
</item>
<item>
<spacer name="verticalSpacer">
<property name="orientation">