diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py
index 6617728f0c..90e80423ff 100644
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@@ -16,6 +16,7 @@ class Field(object):
self.start = start
self.end = None
self.contents = []
+ self.elements = []
self.instructions = []
def add_instr(self, elem):
@@ -24,6 +25,7 @@ class Field(object):
return
name, rest = raw.strip().partition(' ')[0::2]
self.instructions.append((name, rest.strip()))
+ self.elements.append(elem)
WORD, FLAG = 0, 1
scanner = re.Scanner([
@@ -33,25 +35,45 @@ scanner = re.Scanner([
(r'\s+', None),
], flags=re.DOTALL)
+null = object()
-def parse_hyperlink(raw, log):
- ans = {}
- last_option = None
- raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
- for token, token_type in scanner.scan(raw)[0]:
- token = token.replace('\x01', '\\').replace('\x02', '"')
- if token_type is FLAG:
- last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
- if last_option is not None:
- ans[last_option] = None
- elif token_type is WORD:
- if last_option is None:
- ans['url'] = token
- else:
- ans[last_option] = token
- last_option = None
- return ans
+def parser(name, field_map, default_field_name=None):
+ field_map = dict((x.split(':') for x in field_map.split()))
+
+ def parse(raw, log=None):
+ ans = {}
+ last_option = None
+ raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
+ for token, token_type in scanner.scan(raw)[0]:
+ token = token.replace('\x01', '\\').replace('\x02', '"')
+ if token_type is FLAG:
+ last_option = field_map.get(token[1], null)
+ if last_option is not None:
+ ans[last_option] = None
+ elif token_type is WORD:
+ if last_option is None:
+ ans[default_field_name] = token
+ else:
+ ans[last_option] = token
+ last_option = None
+ ans.pop(null, None)
+ return ans
+
+ parse.__name__ = str('parse_' + name)
+
+ return parse
+
+parse_hyperlink = parser('hyperlink',
+ 'l:anchor m:image-map n:target o:title t:target', 'url')
+
+parse_xe = parser('xe',
+ 'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')
+
+parse_index = parser('index',
+ 'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
+ ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
+ ' p:page-number-separator r:run-together y:yomi z:langcode')
class Fields(object):
@@ -79,44 +101,83 @@ class Fields(object):
if stack:
stack[-1].contents.append(elem)
- # Parse hyperlink fields
- self.hyperlink_fields = []
- for field in self.fields:
- if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK':
- hl = parse_hyperlink(field.instructions[0][1], log)
- if hl:
- if 'target' in hl and hl['target'] is None:
- hl['target'] = '_blank'
- all_runs = []
- current_runs = []
- # We only handle spans in a single paragraph
- # being wrapped in
- for x in field.contents:
- if x.tag.endswith('}p'):
- if current_runs:
- all_runs.append(current_runs)
- current_runs = []
- elif x.tag.endswith('}r'):
- current_runs.append(x)
- if current_runs:
- all_runs.append(current_runs)
- for runs in all_runs:
- self.hyperlink_fields.append((hl, runs))
+ field_types = ('hyperlink', 'xe', 'index')
+ parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types}
+ field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types}
-def test_parse_hyperlink():
+ for f in field_types:
+ setattr(self, '%s_fields' % f, [])
+
+ for field in self.fields:
+ if field.instructions:
+ name = field.instructions[0][0]
+ func = parsers.get(name, None)
+ if func is not None:
+ func(field, field_parsers[name], log)
+
+ def parse_hyperlink(self, field, parse_func, log):
+ # Parse hyperlink fields
+ if len(field.instructions) == 1:
+ hl = parse_func(field.instructions[0][1], log)
+ if hl:
+ if 'target' in hl and hl['target'] is None:
+ hl['target'] = '_blank'
+ all_runs = []
+ current_runs = []
+ # We only handle spans in a single paragraph
+ # being wrapped in
+ for x in field.contents:
+ if x.tag.endswith('}p'):
+ if current_runs:
+ all_runs.append(current_runs)
+ current_runs = []
+ elif x.tag.endswith('}r'):
+ current_runs.append(x)
+ if current_runs:
+ all_runs.append(current_runs)
+ for runs in all_runs:
+ self.hyperlink_fields.append((hl, runs))
+
+ def parse_xe(self, field, parse_func, log):
+ # Parse XE fields
+ xe = parse_func(field.instructions[0][1], log) # TODO: Handle field with multiple instructions
+ if xe:
+ # TODO: parse the field contents
+ self.xe_fields.append(xe)
+
+ def parse_index(self, field, parse_func, log):
+ # Parse Index fields
+ if len(field.instructions):
+ idx = parse_func(field.instructions[0][1], log)
+ # TODO: parse the field contents
+ self.index_fields.append(idx)
+
+def test_parse_fields():
import unittest
- class TestParseHyperLink(unittest.TestCase):
+ class TestParseFields(unittest.TestCase):
- def test_parsing(self):
- self.assertEqual(parse_hyperlink(
- r'\l anchor1', None), {'anchor':'anchor1'})
- self.assertEqual(parse_hyperlink(
- r'www.calibre-ebook.com', None), {'url':'www.calibre-ebook.com'})
- self.assertEqual(parse_hyperlink(
- r'www.calibre-ebook.com \t target \o tt', None), {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
- self.assertEqual(parse_hyperlink(
- r'"c:\\Some Folder"', None), {'url': 'c:\\Some Folder'})
+ def test_hyperlink(self):
+ ae = lambda x, y: self.assertEqual(parse_hyperlink(x, None), y)
+ ae(r'\l anchor1', {'anchor':'anchor1'})
+ ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'})
+ ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
+ ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'})
+ ae(r'xxxx \y yyyy', {'url': 'xxxx'})
- suite = unittest.TestLoader().loadTestsFromTestCase(TestParseHyperLink)
+ def test_xe(self):
+ ae = lambda x, y: self.assertEqual(parse_xe(x, None), y)
+ ae(r'"some name"', {'text':'some name'})
+ ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None})
+ ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'})
+
+ def test_index(self):
+ ae = lambda x, y: self.assertEqual(parse_index(x, None), y)
+ ae(r'', {})
+ ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'})
+
+ suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
unittest.TextTestRunner(verbosity=4).run(suite)
+
+if __name__ == '__main__':
+ test_parse_fields()
diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py
new file mode 100644
index 0000000000..3e7e9dc42f
--- /dev/null
+++ b/src/calibre/ebooks/docx/index.py
@@ -0,0 +1,421 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal '
+
+from collections import OrderedDict
+
+from lxml.html.builder import A, SPAN
+import lxml.etree
+
+from calibre.ebooks.docx.names import XPath, ancestor, namespaces
+
+
+NBSP = '\xa0'
+
+class Location(object):
+ r"""
+ This class represents one location in the index.
+ We should provide a way to mark the main entries. Libre office
+ has a main attribute, which doesn't seem to map to docx, and at least
+ some versions of word can mark entries bold or italic with \b and \i.
+ One index entry corresponds to a list of locations where the entry
+ is referenced in the text.
+ """
+
+ def __init__(self, bookmark, target):
+ self.bookmark = bookmark
+ self.target = target
+
+class Entry(object):
+ """
+ This class represents one index entry.
+ We can also have a list of sub-entries for the primary/secondary
+ topic situation.
+ Each entry has a list of locations we want to point to, but
+ it could be empty if this is only here to organize sub-entries.
+ """
+
+ def __init__(self, name, index):
+ self.subentries = {}
+ self.locations = []
+ self.name = name
+ self.index = index
+
+ def add_entry(self, entry, sub):
+ """
+ The entry has the form [xxx, field, bookmark, target]
+ """
+ if len(sub) == 0:
+ self.locations.append(Location(entry[2], entry[3]))
+ else:
+ sube = find_entry(sub[0], self.subentries, self.index)
+ sube.add_entry(entry, sub[1:])
+
+ def make_link(self, loc, amap):
+ # As a first pass, we just put a placeholder in the target location
+ # We want it to float right
+ markid = amap[loc.bookmark]
+ if markid is None:
+ return
+
+ span = A()
+ span.set('style', 'float:right')
+ span.set('href', '#' + markid)
+ from calibre.ebooks.docx.to_html import Text
+ text = Text(span, 'text', [])
+ text.buf.append(loc.target)
+ setattr(text.elem, text.attr, ''.join(text.buf))
+ return span
+
+ def to_htmlunit(self, body, level, amap):
+ """
+ Append the material for one index entry to the document.
+ There is a name, and 0 or more locations.
+ Put the first location, if any, on the same line as the
+ name, and others on following lines.
+ """
+ style = self.index.entry_styles[level]
+ main = add_name(self.name, style)
+ if len(self.locations) == 0:
+ body.append(main)
+ return
+
+ # First link on same line as name
+ link = self.make_link(self.locations[0], amap)
+ main.append(link)
+ body.append(main)
+
+ # Put other links for same entry on their own lines
+ # To keep the link span separate need to put a space as the name
+ for l in self.locations[1:]:
+ link = self.make_link(l, amap)
+ dest = P()
+ dest.set('class', style)
+ dest.text = NBSP
+ dest.append(link)
+ body.append(dest)
+
+ def to_html(self, body, level, amap):
+ level = min(level, 2)
+ self.to_htmlunit(body, level, amap)
+ for key in sorted(self.subentries.keys()):
+ self.subentries[key].to_html(body, level + 1, amap)
+
+class Section(object):
+ """
+ This class represents one section of the index - usually,
+ for example, the A's or the B's.
+ It is primarily a dictionary of entries.
+ """
+
+ def __init__(self, index):
+ self.index = index
+ self.entries = {}
+
+ def add_entry(self, entry):
+ """
+ We have information from one index marker.
+ The entry has form [name, field, bookmark, target].
+ The name is something like A or A:B and so on.
+ If we already have an entry for that name, just add the new
+ location to it; otherwise create a new entry.
+ """
+ topics = entry[0].strip('"').split(':')
+ targ = find_entry(topics[0], self.entries, self.index)
+ targ.add_entry(entry, topics[1:])
+
+ def to_html(self, key, body, amap):
+ """
+ Add one section of the index to the html
+ """
+ if len(key) > 0:
+ body.append(add_name(key, self.index.section_style))
+ for ekey in sorted(self.entries.keys()):
+ self.entries[ekey].to_html(body, 0, amap)
+
+class Index(object):
+ """
+ This class generates an alphabetical index from the index markers in a docx file.
+
+ Each field in the parse of the docx file contains an instructions list.
+ Instructions with name XE are index instructions.
+ The instruction also contains the entry specifier, of the form A[:B[:C]] for
+ main entry, A, subentry B, and so on.
+
+ The index object is a dictionary of sections, 'A' mapping to a section
+ object with all the A entries, and so on. Each section in turn is a dictionary
+ mapping an index specifier, like A:B, to a list of locations where that
+ entry is referenced.
+
+ We could make the formatting more configurable.
+ Currently it uses fixed styles for the various elements, and a section
+ heading for each letter.
+ """
+
+ def __init__(self, convert):
+ """
+ Convert the index markers in the document into an index object.
+ """
+ self.convert = convert
+ self.sections = {}
+
+ self.gen_styles()
+
+ # Get a list of [name, field] entries, where name is the index
+ # entry and field is the indexed location
+ self.entries = self.get_entries()
+
+ # Find styles which are provide the text for links.
+ self.target_styles()
+
+ # Generate bookmarks in the document at the indexed locations
+ self.bookmarks()
+
+ # Set up the entries in index sections
+ for unit in self.entries:
+ sec = self.find_section(unit[0])
+ sec.add_entry(unit)
+
+ def get_entries(self):
+ r"""
+ We already have a list of fields which includes the index marks,
+ identified by an XE tag.
+ In the base case, the field object includes an instruction list
+ with one tuple like ('XE', '"entry"'), where entry is the text we
+ want to put in the index. Note the double quotes around the entry.
+ Sometimes the entry is broken up in the document, for example if
+ there are spelling issues in the entry text.
+ In this case, for reasons I don't understand, the instruction
+ list includes a number of tuples, and we get the actual entry
+ text by concatenating all of them after the initial tag.
+ There can be formatting information in the instructions also, after
+ the double quoted part, like '"entry" \b'.
+ So, we want to concatenate all parts after the initial tag, and
+ then get the part in double quotes.
+ """
+ fields = self.convert.fields.fields
+
+ def get_entry(field):
+ elist = [field.instructions[0][1]]
+ for inst in field.instructions[1:]:
+ elist.append(inst[0])
+ elist.append(inst[1])
+
+ entry = ''.join(elist)
+ sep1 = entry.partition('"')
+ if sep1[2] == '':
+ return entry
+ sep2 = sep1[2].partition('"')
+ return sep2[0]
+
+ # Only want the index entries
+ return [[get_entry(f), f] for f in fields
+ if f.instructions and f.instructions[0][0] == 'XE']
+
+ def target_styles(self):
+ """
+ We want to get a list of styles which represent valid index targets.
+ That is, the text of a link in the index will be the title of the
+ section of the document containing the indexed location.
+ We want the list of styles which can provide a valid title.
+ In practice, this maps to Heading1 through Heading3 in the original document.
+ Calibre apparently preprocesses docx files, so that a paragraph in
+ the original with style Heading1 will now have a different, internal style.
+ In this version we use convert.styles.id_map to find style ids
+ with internal names beginning Heading; but I'd feel better if we
+ jumped in earlier and could map it to the original docx styles.
+ """
+ smap = self.convert.styles.id_map
+ self.targstyles = [name for name, style in smap.iteritems() if style.name.lower().startswith('heading')]
+
+ def is_heading(self, node):
+ """
+ Return true if the input node is a valid index link target.
+ """
+ snodes = XPath("./w:pPr/w:pStyle")(node)
+ if len(snodes) == 0:
+ return False
+
+ sn = snodes[0]
+
+ # The key includes the long namespace information
+ k = [key for key in sn.keys() if key.endswith('}val')]
+ if len(k) == 0:
+ return False
+ style = sn.get(k[0])
+ return style in self.targstyles
+
+ def get_headings(self, node):
+ """
+ Get a list of all children of the input node which are headings -
+ that is, valid targets for an index link
+ """
+ answer = []
+ for c in node.getchildren():
+ if self.is_heading(c):
+ answer.append(c)
+ return answer
+
+ def text_value(self, node):
+ tnodes = XPath("./w:r/w:t")(node)
+ if len(tnodes) == 0:
+ return 'Link'
+ return ''.join((x.text or '') for x in tnodes)
+
+ def find_target(self, node):
+ """
+ Given an index entry, find the text of the last heading section
+ preceding the entry.
+ To do this, find the containing w:p element. If it is a heading,
+ return the text.
+ Otherwise, go up the document level by level, staring with the
+ parent of the w:p element containing the entry.
+ At each level, get the list of heading w:p elements which are
+ children of the top node. We also have the index in the top node
+ of the child node containing the entry.
+ Find the largest index of a heading child which is < the entry
+ index, if any - that is the heading we want.
+ Perhaps we should precalculate some of this.
+ We could also consider doing some of this in xpath, but the style
+ attributes have been modified, so we can't just look for the
+ original names.
+ """
+ pnode = ancestor(node, 'w:p')
+ if self.is_heading(pnode):
+ return self.text_value(pnode)
+
+ while True:
+ parent = pnode.getparent()
+ if parent is None:
+ return 'Link'
+
+ # Maintain document order in these lists
+ pindex = parent.index(pnode)
+ hlist = self.get_headings(parent)
+ hlist = filter(lambda x: parent.index(x) < pindex, hlist)
+ if len(hlist) > 0:
+ return self.text_value(hlist[-1])
+
+ # Try again
+ pnode = parent
+
+ def bookmarks(self):
+ """
+ For each index entry we need to insert a bookmark at the target location.
+ These bookmarks are for our internal use - I'm not sure they would work well
+ in the original docx document.
+ For each entry we have the Field object, which includes the instrText
+ element of the document.
+ Try going to the parent, and inserting a bookmark start just before it.
+ """
+ bmno = 0
+ for entry in self.entries:
+ for instnode in entry[1].elements:
+ name = 'indexBookmark' + str(bmno)
+ bmno += 1
+ tag = "{%s}bookmarkStart" % namespaces['w']
+ att = "{%s}name" % namespaces['w']
+ bookmark = lxml.etree.Element(tag)
+ bookmark.set(att, name)
+ rnode = instnode.getparent()
+
+ # Add the name so that we can link to it
+ entry.append(name)
+
+ # insert the bookmark before rnode
+ rparent = rnode.getparent()
+ rind = rparent.index(rnode)
+ rparent.insert(rind, bookmark)
+
+ # We want the index entry to be the content of the closest
+ # preceding Heading paragraph.
+ # We should make the targets configurable, and add chapter
+ # titles and maybe other things.
+ # What about numbering?
+ targnode = self.find_target(rnode)
+ entry.append(targnode)
+
+ def gen_styles(self):
+ """
+ Generate css styles for the index elements.
+ We do title, section header, and three levels of entries.
+ These are reasonable styles which only set a couple of key
+ values, but we could provide an interface to allow the user to set them.
+ Is there any problem registering the styles this early in the
+ conversion process?
+ """
+ # The result is a string we can use as a class name.
+ css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
+ self.title_style = self.convert.styles.register(css, 'block')
+
+ css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
+ self.section_style = self.convert.styles.register(css, 'block')
+
+ self.entry_styles = []
+ for i in range(3):
+ indent = str(i*20) + 'pt'
+ css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
+ self.entry_styles.append(self.convert.styles.register(css, 'block'))
+
+ def find_section(self, tag):
+ """
+ Find the section for this index entry, creating it if required.
+ The tag has a form like A or A:B or etc.
+ If you want a single index without section divisions, you can
+ just return the single section here every time.
+ """
+ shead = tag[0]
+
+ # Make it lower case, and group all non-alphabetic things together
+ if shead.isalpha():
+ shead = shead.lower()
+ else:
+ shead = ''
+
+ if shead in self.sections:
+ return self.sections[shead]
+ sect = Section(self)
+ self.sections[shead] = sect
+ return sect
+
+ def generate(self):
+ """
+ We generated the index object in the constructor.
+ This method writes it into the html.
+ """
+ # TODO: Only do this at locations of the INDEX field in the document
+ body = self.convert.body
+ body.append(add_name('Index', self.title_style))
+
+ # And write them to the html
+ for key in sorted(self.sections.keys()):
+ self.sections[key].to_html(key, body, self.convert.anchor_map)
+
+def add_name(str, clname):
+ # Put this into the convert document map?
+ dest = P()
+ dest.set('class', clname)
+ span = SPAN()
+ from calibre.ebooks.docx.to_html import Text
+ text = Text(span, 'text', [])
+ text.buf.append(str)
+ setattr(text.elem, text.attr, ''.join(text.buf))
+ dest.append(span)
+ return dest
+
+def find_entry(value, dict, index):
+ """
+ Find the Entry in the dictionary, or create a new one.
+ We convert to lower case to group all capitalizations
+ together as a single entry.
+ """
+ lvalue = value.lower()
+ if lvalue in dict:
+ return dict[lvalue]
+ ent = Entry(value, index)
+ dict[lvalue] = ent
+ return ent
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 1b1622d219..9d09e423f2 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
+# from calibre.ebooks.docx.index import Index
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@@ -97,6 +98,12 @@ class Convert(object):
paras = []
self.log.debug('Converting Word markup to HTML')
+
+ # If we are doing an index, do the body part of the processing here.
+ # We need to insert bookmarks at the indexed locations before the
+ # main conversion work.
+ # index = Index(self)
+
self.read_page_properties(doc)
self.current_rels = relationships_by_id
for wp, page_properties in self.page_map.iteritems():
@@ -105,6 +112,7 @@ class Convert(object):
p = self.convert_p(wp)
self.body.append(p)
paras.append(wp)
+
self.read_block_anchors(doc)
self.styles.apply_contextual_spacing(paras)
# Apply page breaks at the start of every section, except the first
@@ -161,6 +169,9 @@ class Convert(object):
self.resolve_links()
+ # For an index, we now want to append the index object
+ # index.generate()
+
self.styles.cascade(self.layers)
self.tables.apply_markup(self.object_map, self.page_map)