mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
This material adds the option to generate an alphabetical index from index markers in a docx file.
This commit is contained in:
parent
290462909f
commit
f790038819
@ -19,6 +19,9 @@ class DOCXInput(InputFormatPlugin):
|
|||||||
help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
|
help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
|
||||||
'it will be removed from the document and used as the cover for created ebook. This option '
|
'it will be removed from the document and used as the cover for created ebook. This option '
|
||||||
'turns off that behavior.')),
|
'turns off that behavior.')),
|
||||||
|
OptionRecommendation(name='docx_index', recommended_value=False,
|
||||||
|
help=_('If there are embedded index markers in the document, this option will use them to create '
|
||||||
|
'an alphabetical index with links to the locations of the markers.')),
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -26,5 +29,5 @@ class DOCXInput(InputFormatPlugin):
|
|||||||
|
|
||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
from calibre.ebooks.docx.to_html import Convert
|
from calibre.ebooks.docx.to_html import Convert
|
||||||
return Convert(stream, detect_cover=not options.docx_no_cover, log=log)()
|
return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)()
|
||||||
|
|
||||||
|
@ -22,7 +22,8 @@ from calibre.utils.zipfile import ZipFile
|
|||||||
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
||||||
|
|
||||||
def fromstring(raw, parser=RECOVER_PARSER):
|
def fromstring(raw, parser=RECOVER_PARSER):
|
||||||
return etree.fromstring(raw, parser=parser)
|
res = etree.fromstring(raw, parser=parser)
|
||||||
|
return res
|
||||||
|
|
||||||
# Read metadata {{{
|
# Read metadata {{{
|
||||||
def read_doc_props(raw, mi):
|
def read_doc_props(raw, mi):
|
||||||
|
@ -10,12 +10,15 @@ import re
|
|||||||
|
|
||||||
from calibre.ebooks.docx.names import XPath, get
|
from calibre.ebooks.docx.names import XPath, get
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
class Field(object):
|
class Field(object):
|
||||||
|
|
||||||
def __init__(self, start):
|
def __init__(self, start):
|
||||||
self.start = start
|
self.start = start
|
||||||
self.end = None
|
self.end = None
|
||||||
self.contents = []
|
self.contents = []
|
||||||
|
self.elements = []
|
||||||
self.instructions = []
|
self.instructions = []
|
||||||
|
|
||||||
def add_instr(self, elem):
|
def add_instr(self, elem):
|
||||||
@ -24,6 +27,7 @@ class Field(object):
|
|||||||
return
|
return
|
||||||
name, rest = raw.strip().partition(' ')[0::2]
|
name, rest = raw.strip().partition(' ')[0::2]
|
||||||
self.instructions.append((name, rest.strip()))
|
self.instructions.append((name, rest.strip()))
|
||||||
|
self.elements.append(elem)
|
||||||
|
|
||||||
WORD, FLAG = 0, 1
|
WORD, FLAG = 0, 1
|
||||||
scanner = re.Scanner([
|
scanner = re.Scanner([
|
||||||
|
427
src/calibre/ebooks/docx/index.py
Normal file
427
src/calibre/ebooks/docx/index.py
Normal file
@ -0,0 +1,427 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
from collections import OrderedDict
|
||||||
|
from lxml import html
|
||||||
|
from lxml.html.builder import (
|
||||||
|
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1)
|
||||||
|
from calibre.ebooks.docx.names import (
|
||||||
|
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
|
||||||
|
ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
|
||||||
|
|
||||||
|
import lxml.etree
|
||||||
|
|
||||||
|
NBSP = '\xa0'
|
||||||
|
|
||||||
|
class Location:
    """
    A single place in the document that an index entry points at.

    One index entry corresponds to a list of these, one for every spot
    where the entry is referenced in the text.

    There is currently no way to mark "main" locations. LibreOffice has a
    main attribute which does not seem to map to docx, and at least some
    versions of Word can mark entries bold or italic with the backslash-b
    and backslash-i switches.
    """

    def __init__(self, bookmark, target):
        # bookmark: name of the bookmark inserted at the indexed location
        # target: text to display for the link that leads to that bookmark
        self.bookmark, self.target = bookmark, target
|
||||||
|
|
||||||
|
class Entry:
    """
    One index entry.

    An entry can carry a dictionary of subentries (the primary/secondary
    topic situation) and a list of locations to link to. The location
    list may be empty when the entry only exists to organize subentries.
    """

    def __init__(self, name, index):
        self.subentries = {}  # lower-cased subentry name -> Entry
        self.locations = []   # Location objects, in the order they were added
        self.name = name
        self.index = index    # owning Index; supplies the css class names

    def addEntry(self, entry, sub):
        """
        Record one index marker under this entry.

        The entry has the form [xxx, field, bookmark, target]. ``sub`` is
        the list of subentry names that still have to be descended
        through; when it is empty the marker belongs to this entry itself.
        """
        if not sub:
            self.locations.append(Location(entry[2], entry[3]))
        else:
            sube = Index.findEntry(sub[0], self.subentries, self.index)
            sube.addEntry(entry, sub[1:])

    def makeLink(self, loc, amap):
        """
        Build the link element for one location.

        ``amap`` maps bookmark names to the anchor ids used in the
        generated html. Returns the ``<a>`` element, or None when there is
        no anchor for the bookmark of ``loc``.
        """
        # .get() rather than [] so that a bookmark without an anchor takes
        # the "no link" path instead of raising KeyError.
        markid = amap.get(loc.bookmark)
        if markid is None:
            return None

        # Imported here because to_html imports this module at load time.
        from calibre.ebooks.docx.to_html import Text

        # As a first pass, we just put a placeholder in the target location.
        # We want it to float right.
        link = A()
        link.set('style', 'float:right')
        link.set('href', '#' + markid)
        text = Text(link, 'text', [])
        text.buf.append(loc.target)
        setattr(text.elem, text.attr, ''.join(text.buf))
        return link

    def toHtmlUnit(self, body, level, amap):
        """
        Append the material for one index entry to the document.

        There is a name, and 0 or more locations. The first link, if any,
        goes on the same line as the name and the others on following
        lines. Locations for which no link can be made are skipped.
        """
        style = self.index.entryStyles[level]
        main = Index.addName(self.name, style)

        # makeLink() returns None for a location without an anchor; lxml
        # refuses to append None, so filter those out up front.
        links = []
        for loc in self.locations:
            link = self.makeLink(loc, amap)
            if link is not None:
                links.append(link)

        # First link on same line as name
        if links:
            main.append(links[0])
        body.append(main)

        # Put other links for same entry on their own lines.
        # To keep the link span separate need to put a space as the name.
        for link in links[1:]:
            dest = P()
            dest.set('class', style)
            dest.text = NBSP
            dest.append(link)
            body.append(dest)

    def toHtml(self, body, level, amap):
        """
        Append this entry, then all of its subentries in sorted key
        order, to ``body``. ``level`` is the nesting depth, starting at 0.
        """
        # Only a fixed number of entry styles is registered by the index;
        # anything nested deeper reuses the deepest one.
        level = min(level, len(self.index.entryStyles) - 1)
        self.toHtmlUnit(body, level, amap)
        for key in sorted(self.subentries):
            self.subentries[key].toHtml(body, level + 1, amap)
|
||||||
|
|
||||||
|
class Section:
    """
    One section of the index - typically all the A's, or all the B's.

    At heart this is a dictionary of top level entries.
    """

    def __init__(self, index):
        self.index = index
        self.entries = {}

    def addEntry(self, entry):
        """
        File the information from one index marker in this section.

        The entry has form [name, field, bookmark, target], where the
        name looks like A or A:B and so on. If an entry already exists
        for the leading topic the new location is added to it, otherwise
        a new entry is created first.
        """
        path = entry[0].strip('"').split(':')
        top, below = path[0], path[1:]
        Index.findEntry(top, self.entries, self.index).addEntry(entry, below)

    def toHtml(self, key, body, amap):
        """
        Add this section of the index to the html.

        A heading is written only for a non-empty key; the entries follow
        in sorted key order.
        """
        if len(key):
            heading = Index.addName(key, self.index.sectionStyle)
            body.append(heading)
        for ekey in sorted(self.entries):
            self.entries[ekey].toHtml(body, 0, amap)
|
||||||
|
|
||||||
|
class Index:
    """
    This class generates an alphabetical index from the index markers in a
    docx file.

    Each field in the parse of the docx file contains an instructions list.
    Instructions with name XE are index instructions.
    The instruction also contains the entry specifier, of the form
    A[:B[:C]] for main entry A, subentry B, and so on.

    The index object holds a dictionary of sections, 'a' mapping to a
    section object with all the A entries, and so on. Each section in turn
    is a dictionary of entries, and each entry holds the list of locations
    where it is referenced plus its subentries.

    We could make the formatting more configurable.
    Currently it uses fixed styles for the various elements, and a section
    heading for each letter.
    """

    def __init__(self, convert):
        """
        Convert the index markers in the document into an index object.
        """
        self.convert = convert
        self.sections = {}

        self.genStyles()

        # Get a list of [name, field] entries, where name is the index
        # entry and field is the indexed location
        self.entries = self.getEntries()

        # Find the styles which provide the text for links.
        self.targetStyles()

        # Generate bookmarks in the document at the indexed locations.
        # This extends every entry to [name, field, bookmark, target, ...].
        self.bookmarks()

        # Set up the entries in index sections
        for unit in self.entries:
            sec = self.findSection(unit[0])
            sec.addEntry(unit)

    def getEntries(self):
        """
        Return a list of [entry text, field] pairs, one per index marker.

        We already have a list of fields which includes the index marks,
        identified by an XE tag.
        In the base case, the field object includes an instruction list
        with one tuple like ('XE', '"entry"'), where entry is the text we
        want to put in the index. Note the double quotes around the entry.
        Sometimes the entry is broken up in the document, for example if
        there are spelling issues in the entry text. In that case the
        instruction list includes a number of tuples, and we get the
        actual entry text by concatenating all of them after the initial
        tag.
        There can be formatting information in the instructions also,
        after the double quoted part (a backslash-b switch, for example).
        So, we want to concatenate all parts after the initial tag, and
        then get the part in double quotes.
        """
        fields = self.convert.fields.fields

        # Only want the index entries. This must be a real list: it is
        # walked more than once and its items are extended in place.
        return [[self.getEntry(f), f] for f in fields
                if f.instructions and f.instructions[0][0] == 'XE']

    def getEntry(self, field):
        """
        Return the index entry text of one XE field: the part between the
        first pair of double quotes, or the whole instruction text if it
        is not quoted that way.
        """
        parts = [field.instructions[0][1]]
        for name, rest in field.instructions[1:]:
            parts.append(name)
            if rest:
                # Each instruction was split at its first space into
                # (name, rest), so put that space back when rejoining.
                # NOTE(review): whitespace at the very start or end of a
                # fragment was already stripped by the field parser and
                # cannot be recovered here.
                parts.append(' ')
                parts.append(rest)

        entry = ''.join(parts)
        after = entry.partition('"')[2]
        if after == '':
            return entry
        return after.partition('"')[0]

    def targetStyles(self):
        """
        Collect the ids of the styles which represent valid index targets.

        The text of a link in the index will be the title of the section
        of the document containing the indexed location, so we want the
        list of styles which can provide a valid title.
        In practice, this maps to Heading1 through Heading3 in the
        original document.
        Calibre apparently preprocesses docx files, so that a paragraph in
        the original with style Heading1 will now have a different,
        internal style.
        In this version we use convert.styles.id_map to find style ids
        with internal names beginning Heading; it might be better to jump
        in earlier and map to the original docx styles.
        """
        smap = self.convert.styles.id_map
        # A style may have no name at all; treat that as "not a heading".
        self.targstyles = [sid for sid in smap
                           if (smap[sid].name or '').startswith('Heading')]

    def isHeading(self, node):
        """
        Return true if the input node is a valid index link target.
        """
        snodes = XPath("./w:pPr/w:pStyle")(node)
        if not snodes:
            return False

        sn = snodes[0]

        # The attribute key includes the long namespace information, so
        # match on its local name.
        for key in sn.keys():
            if key.endswith('}val'):
                return sn.get(key) in self.targstyles
        return False

    def getHeadings(self, node):
        """
        Get a list of all children of the input node which are headings -
        that is, valid targets for an index link
        """
        return [c for c in node.getchildren() if self.isHeading(c)]

    def textValue(self, node):
        """
        Return the concatenated text of the w:r/w:t children of node, or
        the placeholder 'Link' when there are none.
        """
        tnodes = XPath("./w:r/w:t")(node)
        if not tnodes:
            return 'Link'
        # An empty w:t element has text None, which join() would reject.
        return ''.join(t.text or '' for t in tnodes)

    def findTarget(self, node):
        """
        Given an index entry, find the text of the last heading section
        preceding the entry.

        To do this, find the containing w:p element. If it is a heading,
        return the text.
        Otherwise, go up the document level by level, starting with the
        parent of the w:p element containing the entry.
        At each level, look at the children of the parent which come
        before the child containing the entry, nearest first; the first
        heading found is the one we want.
        Returns 'Link' if no heading is found.

        Perhaps we should precalculate some of this.
        We could also consider doing some of this in xpath, but the style
        attributes have been modified, so we can't just look for the
        original names.
        """
        pnode = ancestor(node, 'w:p')
        if pnode is None:
            # NOTE(review): presumably ancestor() returns None when node is
            # not inside a w:p - confirm. Start climbing from node itself.
            pnode = node
        elif self.isHeading(pnode):
            return self.textValue(pnode)

        while True:
            parent = pnode.getparent()
            if parent is None:
                return 'Link'

            # Walk the preceding siblings from the nearest one backwards,
            # which gives the last heading in document order.
            for sib in reversed(parent[:parent.index(pnode)]):
                if self.isHeading(sib):
                    return self.textValue(sib)

            # Try again one level up
            pnode = parent

    def bookmarks(self):
        """
        For each index entry we need to insert a bookmark at the target
        location.

        These bookmarks are for our internal use - they might not work
        well in the original docx document.
        For each entry we have the Field object, which includes the
        instrText elements of the document.
        We go to the parent of such an element and insert a bookmark
        start just before it. The bookmark name and the link text are
        appended to the entry.
        """
        tag = "{%s}bookmarkStart" % namespaces['w']
        att = "{%s}name" % namespaces['w']
        bmno = 0
        for entry in self.entries:
            # NOTE: a field with several instruction elements gets one
            # bookmark per element, but only the first (name, target)
            # pair appended here is read back (entry[2], entry[3]).
            for instnode in entry[1].elements:
                name = 'indexBookmark' + str(bmno)
                bmno += 1
                bookmark = lxml.etree.Element(tag)
                bookmark.set(att, name)
                rnode = instnode.getparent()

                # Add the name so that we can link to it
                entry.append(name)

                # insert the bookmark before rnode
                rparent = rnode.getparent()
                rparent.insert(rparent.index(rnode), bookmark)

                # We want the index entry to be the content of the closest
                # preceding Heading paragraph.
                # We should make the targets configurable, and add chapter
                # titles and maybe other things.
                # What about numbering?
                entry.append(self.findTarget(rnode))

    def genStyles(self):
        """
        Generate css styles for the index elements.

        We do title, section header, and three levels of entries.
        These are reasonable styles which only set a couple of key
        values, but we could provide an interface to allow the user to set
        them.
        Is there any problem registering the styles this early in the
        conversion process?
        """
        # The result of register() is a string we can use as a class name.
        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
        self.titleStyle = self.convert.styles.register(css, 'block')

        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
        self.sectionStyle = self.convert.styles.register(css, 'block')

        self.entryStyles = []
        for i in range(3):
            indent = str(i*20) + 'pt'
            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
            self.entryStyles.append(self.convert.styles.register(css, 'block'))

    def findSection(self, tag):
        """
        Find the section for this index entry, creating it if required.

        The tag has a form like A or A:B or etc.
        If you want a single index without section divisions, you can
        just return the single section here every time.
        """
        # A slice, not tag[0]: the entry text can be empty.
        shead = tag[:1]

        # Make it lower case, and group all non-alphabetic things together
        if shead.isalpha():
            shead = shead.lower()
        else:
            shead = ''

        if shead in self.sections:
            return self.sections[shead]
        sect = Section(self)
        self.sections[shead] = sect
        return sect

    def generate(self):
        """
        We generated the index object in the constructor.
        This method writes it into the html.
        """
        body = self.convert.body
        body.append(Index.addName('Index', self.titleStyle))

        # And write the sections to the html
        for key in sorted(self.sections):
            self.sections[key].toHtml(key, body, self.convert.anchor_map)

    @staticmethod
    def addName(str, clname):
        """
        Return a paragraph with css class ``clname`` containing a span
        whose text is ``str``.
        """
        # NOTE: the first parameter shadows the builtin str; the name is
        # kept as is so that the signature does not change.
        # Put this into the convert document map?
        from calibre.ebooks.docx.to_html import Text
        dest = P()
        dest.set('class', clname)
        span = SPAN()
        text = Text(span, 'text', [])
        text.buf.append(str)
        setattr(text.elem, text.attr, ''.join(text.buf))
        dest.append(span)
        return dest

    @staticmethod
    def findEntry(value, dict, index):
        """
        Find the Entry in the dictionary, or create a new one.

        We convert to lower case to group all capitalizations together as
        a single entry. A new entry keeps the capitalization it was first
        seen with as its display name.
        """
        # NOTE: the second parameter shadows the builtin dict; the name is
        # kept as is so that the signature does not change.
        lvalue = value.lower()
        if lvalue in dict:
            return dict[lvalue]
        ent = Entry(value, index)
        dict[lvalue] = ent
        return ent
|
@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
|
|||||||
from calibre.ebooks.docx.toc import create_toc
|
from calibre.ebooks.docx.toc import create_toc
|
||||||
from calibre.ebooks.docx.fields import Fields
|
from calibre.ebooks.docx.fields import Fields
|
||||||
from calibre.ebooks.docx.settings import Settings
|
from calibre.ebooks.docx.settings import Settings
|
||||||
|
from calibre.ebooks.docx.index import Index
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||||
|
|
||||||
@ -44,12 +45,13 @@ class Text:
|
|||||||
|
|
||||||
class Convert(object):
|
class Convert(object):
|
||||||
|
|
||||||
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
|
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None):
|
||||||
self.docx = DOCX(path_or_stream, log=log)
|
self.docx = DOCX(path_or_stream, log=log)
|
||||||
self.ms_pat = re.compile(r'\s{2,}')
|
self.ms_pat = re.compile(r'\s{2,}')
|
||||||
self.ws_pat = re.compile(r'[\n\r\t]')
|
self.ws_pat = re.compile(r'[\n\r\t]')
|
||||||
self.log = self.docx.log
|
self.log = self.docx.log
|
||||||
self.detect_cover = detect_cover
|
self.detect_cover = detect_cover
|
||||||
|
self.do_index = do_index
|
||||||
self.notes_text = notes_text or _('Notes')
|
self.notes_text = notes_text or _('Notes')
|
||||||
self.dest_dir = dest_dir or os.getcwdu()
|
self.dest_dir = dest_dir or os.getcwdu()
|
||||||
self.mi = self.docx.metadata
|
self.mi = self.docx.metadata
|
||||||
@ -97,6 +99,14 @@ class Convert(object):
|
|||||||
paras = []
|
paras = []
|
||||||
|
|
||||||
self.log.debug('Converting Word markup to HTML')
|
self.log.debug('Converting Word markup to HTML')
|
||||||
|
|
||||||
|
# If we are doing an index, do the body part of the processing here.
|
||||||
|
# We need to insert bookmarks at the indexed locations before the
|
||||||
|
# main conversion work.
|
||||||
|
if self.do_index:
|
||||||
|
self.log.debug('Generating index')
|
||||||
|
index = Index(self)
|
||||||
|
|
||||||
self.read_page_properties(doc)
|
self.read_page_properties(doc)
|
||||||
self.current_rels = relationships_by_id
|
self.current_rels = relationships_by_id
|
||||||
for wp, page_properties in self.page_map.iteritems():
|
for wp, page_properties in self.page_map.iteritems():
|
||||||
@ -105,6 +115,7 @@ class Convert(object):
|
|||||||
p = self.convert_p(wp)
|
p = self.convert_p(wp)
|
||||||
self.body.append(p)
|
self.body.append(p)
|
||||||
paras.append(wp)
|
paras.append(wp)
|
||||||
|
|
||||||
self.read_block_anchors(doc)
|
self.read_block_anchors(doc)
|
||||||
self.styles.apply_contextual_spacing(paras)
|
self.styles.apply_contextual_spacing(paras)
|
||||||
# Apply page breaks at the start of every section, except the first
|
# Apply page breaks at the start of every section, except the first
|
||||||
@ -157,6 +168,10 @@ class Convert(object):
|
|||||||
parent.text = tabs[-1].tail or ''
|
parent.text = tabs[-1].tail or ''
|
||||||
map(parent.remove, tabs)
|
map(parent.remove, tabs)
|
||||||
|
|
||||||
|
# For an index, we now want to append the index object
|
||||||
|
if self.do_index:
|
||||||
|
index.generate()
|
||||||
|
|
||||||
self.images.rid_map = orig_rid_map
|
self.images.rid_map = orig_rid_map
|
||||||
|
|
||||||
self.resolve_links()
|
self.resolve_links()
|
||||||
|
@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):
|
|||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||||
Widget.__init__(self, parent,
|
Widget.__init__(self, parent,
|
||||||
['docx_no_cover', ])
|
['docx_no_cover', 'docx_index', ])
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
|
|
||||||
|
@ -21,6 +21,13 @@
|
|||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
<item>
|
||||||
|
<widget class="QCheckBox" name="opt_docx_index">
|
||||||
|
<property name="text">
|
||||||
|
<string>Generate an alphabetical index from embedded index markers</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
<item>
|
<item>
|
||||||
<spacer name="verticalSpacer">
|
<spacer name="verticalSpacer">
|
||||||
<property name="orientation">
|
<property name="orientation">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user