Merge branch 'index'

This commit is contained in:
Kovid Goyal 2014-03-30 21:52:46 +05:30
commit c2d94d518f
3 changed files with 545 additions and 52 deletions

View File

@ -16,6 +16,7 @@ class Field(object):
self.start = start
self.end = None
self.contents = []
self.elements = []
self.instructions = []
def add_instr(self, elem):
@ -24,6 +25,7 @@ class Field(object):
return
name, rest = raw.strip().partition(' ')[0::2]
self.instructions.append((name, rest.strip()))
self.elements.append(elem)
WORD, FLAG = 0, 1
scanner = re.Scanner([
@ -33,25 +35,45 @@ scanner = re.Scanner([
(r'\s+', None),
], flags=re.DOTALL)
null = object()
# Parse the instruction text of a HYPERLINK field into a dict of options.
# NOTE(review): this is the hand-written version of the parser; the name
# parse_hyperlink is rebound further down to a parser() generated function
# with the same flag table, so this definition looks like the pre-change
# side of the diff -- confirm against the real file.
def parse_hyperlink(raw, log):
ans = {}
last_option = None
# Hide escaped backslashes and escaped quotes behind placeholder characters
# so that they do not confuse the scanner
raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
for token, token_type in scanner.scan(raw)[0]:
# Restore the characters that were hidden above
token = token.replace('\x01', '\\').replace('\x02', '"')
if token_type is FLAG:
# token[1] is the flag letter; unknown letters map to None
last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
if last_option is not None:
# A known flag is recorded even if no value follows it
ans[last_option] = None
elif token_type is WORD:
if last_option is None:
# A word that is not the value of a known flag is the URL
ans['url'] = token
else:
ans[last_option] = token
last_option = None
return ans
def parser(name, field_map, default_field_name=None):
    """
    Build a function that parses the instruction text of one kind of field.

    :param name: Short field name, used only to name the returned function
        (``parse_<name>``).
    :param field_map: Whitespace separated ``letter:option-name`` pairs
        describing the flags the field understands, for example
        ``'l:anchor o:title'``.
    :param default_field_name: Key under which a bare word (one that is not
        the value of a flag) is stored. When it is None such words are
        ignored instead of being stored under a ``None`` key.
    :return: A callable ``parse(raw, log=None)`` returning a dict that maps
        option names to their value, or to None for a flag without a value.
    """
    field_map = dict(x.split(':') for x in field_map.split())

    def parse(raw, log=None):
        ans = {}
        last_option = None
        # Hide escaped backslashes and escaped quotes behind placeholder
        # characters so that they do not confuse the scanner
        raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
        for token, token_type in scanner.scan(raw)[0]:
            # Restore the characters that were hidden above
            token = token.replace('\x01', '\\').replace('\x02', '"')
            if token_type is FLAG:
                # token[1] is the flag letter. Unknown flags map to the null
                # sentinel so that the word following them is swallowed
                # rather than being mistaken for the default field.
                last_option = field_map.get(token[1], null)
                ans[last_option] = None
            elif token_type is WORD:
                if last_option is None:
                    if default_field_name is not None:
                        ans[default_field_name] = token
                else:
                    ans[last_option] = token
                    last_option = None
        # Drop whatever was collected for unknown flags
        ans.pop(null, None)
        return ans

    parse.__name__ = str('parse_' + name)

    return parse
# Parser for HYPERLINK fields; a bare word is stored as the 'url'
parse_hyperlink = parser('hyperlink',
'l:anchor m:image-map n:target o:title t:target', 'url')
# Parser for XE (index entry) fields; a bare word is stored as the 'text'
parse_xe = parser('xe',
'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')
# Parser for INDEX fields; no default field name is given, only flags
parse_index = parser('index',
'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
' f:entry-type g:page-range-separator h:heading k:crossref-separator'
' p:page-number-separator r:run-together y:yomi z:langcode')
class Fields(object):
@ -79,44 +101,83 @@ class Fields(object):
if stack:
stack[-1].contents.append(elem)
# Parse hyperlink fields
self.hyperlink_fields = []
for field in self.fields:
if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK':
hl = parse_hyperlink(field.instructions[0][1], log)
if hl:
if 'target' in hl and hl['target'] is None:
hl['target'] = '_blank'
all_runs = []
current_runs = []
# We only handle spans in a single paragraph
# being wrapped in <a>
for x in field.contents:
if x.tag.endswith('}p'):
if current_runs:
all_runs.append(current_runs)
current_runs = []
elif x.tag.endswith('}r'):
current_runs.append(x)
if current_runs:
all_runs.append(current_runs)
for runs in all_runs:
self.hyperlink_fields.append((hl, runs))
field_types = ('hyperlink', 'xe', 'index')
parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types}
field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types}
def test_parse_hyperlink():
for f in field_types:
setattr(self, '%s_fields' % f, [])
for field in self.fields:
if field.instructions:
name = field.instructions[0][0]
func = parsers.get(name, None)
if func is not None:
func(field, field_parsers[name], log)
def parse_hyperlink(self, field, parse_func, log):
    """
    Record the runs covered by a HYPERLINK field.

    Only fields with exactly one instruction are handled. The instruction
    text is parsed with parse_func and, if anything was parsed, one
    ``(options, runs)`` tuple is appended to self.hyperlink_fields for
    every group of runs found in the field contents.
    """
    if len(field.instructions) != 1:
        return
    hl = parse_func(field.instructions[0][1], log)
    if not hl:
        return
    # A target flag without a value means: open in a new window
    if 'target' in hl and hl['target'] is None:
        hl['target'] = '_blank'
    # We only handle spans in a single paragraph being wrapped in <a>,
    # so a paragraph element closes the group of runs collected so far
    groups, pending = [], []
    for item in field.contents:
        tag = item.tag
        if tag.endswith('}p'):
            if pending:
                groups.append(pending)
            pending = []
        elif tag.endswith('}r'):
            pending.append(item)
    if pending:
        groups.append(pending)
    self.hyperlink_fields.extend((hl, group) for group in groups)
def parse_xe(self, field, parse_func, log):
    """
    Parse an XE (index entry) field and remember the result.

    The text of the first instruction is parsed with parse_func; a
    non-empty result is appended to self.xe_fields.
    """
    # TODO: Handle field with multiple instructions
    instruction_text = field.instructions[0][1]
    parsed = parse_func(instruction_text, log)
    if not parsed:
        return
    # TODO: parse the field contents
    self.xe_fields.append(parsed)
def parse_index(self, field, parse_func, log):
    """
    Parse an INDEX field and remember the result.

    The text of the first instruction is parsed with parse_func and the
    result is appended to self.index_fields, even when it is empty.
    Fields without instructions are ignored.
    """
    if not field.instructions:
        return
    parsed = parse_func(field.instructions[0][1], log)
    # TODO: parse the field contents
    self.index_fields.append(parsed)
# Run the unit tests for the field instruction parsers (parse_hyperlink,
# parse_xe and parse_index) with a verbose text test runner.
# NOTE(review): this listing appears to interleave the old and the new
# version of the test (two TestCase class headers, two suite assignments and
# the same hyperlink assertions written twice) -- confirm against the real
# file before editing.
def test_parse_fields():
import unittest
class TestParseHyperLink(unittest.TestCase):
class TestParseFields(unittest.TestCase):
def test_parsing(self):
self.assertEqual(parse_hyperlink(
r'\l anchor1', None), {'anchor':'anchor1'})
self.assertEqual(parse_hyperlink(
r'www.calibre-ebook.com', None), {'url':'www.calibre-ebook.com'})
self.assertEqual(parse_hyperlink(
r'www.calibre-ebook.com \t target \o tt', None), {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
self.assertEqual(parse_hyperlink(
r'"c:\\Some Folder"', None), {'url': 'c:\\Some Folder'})
def test_hyperlink(self):
# ae(raw, expected): parsing raw as a HYPERLINK field must give expected
ae = lambda x, y: self.assertEqual(parse_hyperlink(x, None), y)
ae(r'\l anchor1', {'anchor':'anchor1'})
ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'})
ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'})
# An unknown flag and the word following it are both dropped
ae(r'xxxx \y yyyy', {'url': 'xxxx'})
suite = unittest.TestLoader().loadTestsFromTestCase(TestParseHyperLink)
def test_xe(self):
# ae(raw, expected): parsing raw as an XE field must give expected
ae = lambda x, y: self.assertEqual(parse_xe(x, None), y)
ae(r'"some name"', {'text':'some name'})
# Flags without a value are present with the value None
ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None})
ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'})
def test_index(self):
# ae(raw, expected): parsing raw as an INDEX field must give expected
ae = lambda x, y: self.assertEqual(parse_index(x, None), y)
ae(r'', {})
ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'})
suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
unittest.TextTestRunner(verbosity=4).run(suite)
# Allow running the tests by executing this module directly
if __name__ == '__main__':
test_parse_fields()

View File

@ -0,0 +1,421 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict

import lxml.etree
from lxml.html.builder import A, P, SPAN

from calibre.ebooks.docx.names import XPath, ancestor, namespaces
NBSP = '\xa0'
class Location(object):
    r"""
    A single place in the document that an index entry points to.

    An index entry owns a list of these, one for every spot in the text
    where the entry is referenced.

    We should provide a way to mark the main entries. Libre office has a
    main attribute, which doesn't seem to map to docx, and at least some
    versions of word can mark entries bold or italic with \b and \i.
    """

    def __init__(self, bookmark, target):
        # bookmark: name of the bookmark placed at the indexed location
        # target: text shown for the link to that location
        self.bookmark, self.target = bookmark, target
class Entry(object):
    """
    This class represents one index entry.

    An entry can have sub-entries, for the primary/secondary topic
    situation, and a list of locations it points to. The list of locations
    may be empty if the entry only exists to organize its sub-entries.
    """

    def __init__(self, name, index):
        self.subentries = {}
        self.locations = []
        self.name = name
        self.index = index

    def add_entry(self, entry, sub):
        """
        Attach one index marker to this entry or to one of its sub-entries.

        The entry has the form [xxx, field, bookmark, target]. sub is the
        list of sub-topic names still to be descended into; when it is
        empty the location belongs to this entry itself.
        """
        if not sub:
            self.locations.append(Location(entry[2], entry[3]))
        else:
            sube = find_entry(sub[0], self.subentries, self.index)
            sube.add_entry(entry, sub[1:])

    def make_link(self, loc, amap):
        """
        Create the <a> element linking to loc.

        Returns None when amap has no usable anchor for the bookmark of
        loc, either because the bookmark is missing from amap or because it
        is mapped to None.
        """
        # As a first pass, we just put a placeholder in the target location
        # We want it to float right
        try:
            markid = amap[loc.bookmark]
        except KeyError:
            # A bookmark without an anchor is treated like an empty one
            markid = None
        if markid is None:
            return None
        span = A()
        span.set('style', 'float:right')
        span.set('href', '#' + markid)
        from calibre.ebooks.docx.to_html import Text
        text = Text(span, 'text', [])
        text.buf.append(loc.target)
        setattr(text.elem, text.attr, ''.join(text.buf))
        return span

    def to_htmlunit(self, body, level, amap):
        """
        Append the material for one index entry to the document.

        There is a name, and 0 or more locations. The first link, if any,
        goes on the same line as the name and the others on following
        lines. Locations for which no link can be made are left out.
        """
        style = self.index.entry_styles[level]
        main = add_name(self.name, style)
        # make_link() returns None for locations without an anchor and None
        # cannot be appended to an element, so filter those out first
        links = [link for link in
                 (self.make_link(loc, amap) for loc in self.locations)
                 if link is not None]
        # First link on same line as name
        if links:
            main.append(links[0])
        body.append(main)
        # Put other links for same entry on their own lines
        # To keep the link span separate need to put a space as the name
        for link in links[1:]:
            dest = P()
            dest.set('class', style)
            dest.text = NBSP
            dest.append(link)
            body.append(dest)

    def to_html(self, body, level, amap):
        """
        Write this entry and then, recursively, its sub-entries sorted by
        key. There are only three entry styles, so the level is capped at 2.
        """
        level = min(level, 2)
        self.to_htmlunit(body, level, amap)
        for key in sorted(self.subentries):
            self.subentries[key].to_html(body, level + 1, amap)
class Section(object):
    """
    One section of the index - usually, for example, the A's or the B's.

    It is primarily a dictionary of entries, keyed by the lower cased name
    of the top level topic.
    """

    def __init__(self, index):
        self.entries = {}
        self.index = index

    def add_entry(self, entry):
        """
        Record the information from one index marker.

        The entry has form [name, field, bookmark, target], where the name
        is something like A or A:B and so on. The location is added to the
        entry for the first topic, which is created if it does not exist
        yet; the remaining topics are handed down to it as sub-entries.
        """
        topics = entry[0].strip('"').split(':')
        head, rest = topics[0], topics[1:]
        find_entry(head, self.entries, self.index).add_entry(entry, rest)

    def to_html(self, key, body, amap):
        """
        Add one section of the index to the html: a heading for key, unless
        key is empty, followed by the entries sorted by key.
        """
        if key:
            heading = add_name(key, self.index.section_style)
            body.append(heading)
        for ekey in sorted(self.entries):
            self.entries[ekey].to_html(body, 0, amap)
class Index(object):
    """
    This class generates an alphabetical index from the index markers in a
    docx file.

    Each field in the parse of the docx file contains an instructions list.
    Instructions with name XE are index instructions. The instruction also
    contains the entry specifier, of the form A[:B[:C]] for main entry A,
    subentry B, and so on.

    The index object is a dictionary of sections, 'a' mapping to a section
    object with all the A entries, and so on. Each section in turn holds
    the entries, with their sub-entries and the locations where they are
    referenced.

    We could make the formatting more configurable. Currently it uses fixed
    styles for the various elements, and a section heading for each letter.
    """

    def __init__(self, convert):
        """
        Convert the index markers in the document into an index object.
        """
        self.convert = convert
        self.sections = {}

        self.gen_styles()

        # Get a list of [name, field] entries, where name is the index
        # entry and field is the indexed location
        self.entries = self.get_entries()

        # Find the styles which provide the text for links.
        self.target_styles()

        # Generate bookmarks in the document at the indexed locations.
        # This extends every entry to [name, field, bookmark, target, ...]
        self.bookmarks()

        # Set up the entries in index sections
        for unit in self.entries:
            sec = self.find_section(unit[0])
            sec.add_entry(unit)

    def get_entries(self):
        r"""
        Return a list of [name, field] pairs, one for every XE field.

        We already have a list of fields which includes the index marks,
        identified by an XE tag.

        In the base case, the field object includes an instruction list
        with one tuple like ('XE', '"entry"'), where entry is the text we
        want to put in the index. Note the double quotes around the entry.

        Sometimes the entry is broken up in the document, for example if
        there are spelling issues in the entry text. In this case the
        instruction list includes a number of tuples, and we get the actual
        entry text by concatenating all of them after the initial tag.

        There can be formatting information in the instructions also, after
        the double quoted part, like '"entry" \b'. So, we want to
        concatenate all parts after the initial tag, and then get the part
        in double quotes.
        """
        fields = self.convert.fields.fields

        def get_entry(field):
            parts = [field.instructions[0][1]]
            for inst in field.instructions[1:]:
                parts.append(inst[0])
                parts.append(inst[1])
            entry = ''.join(parts)
            after_open = entry.partition('"')[2]
            if after_open == '':
                # No quote at all, or nothing after it: use the raw text
                return entry
            return after_open.partition('"')[0]

        # Only want the index entries
        return [[get_entry(f), f] for f in fields
                if f.instructions and f.instructions[0][0] == 'XE']

    def target_styles(self):
        """
        Collect the ids of the styles which represent valid index targets.

        The text of a link in the index will be the title of the section of
        the document containing the indexed location, so we want the list of
        styles which can provide a valid title. In practice, this maps to
        the heading styles of the original document.

        In this version we use convert.styles.id_map to find the ids of
        styles whose name begins with heading, ignoring case.
        """
        smap = self.convert.styles.id_map
        # NOTE(review): `or ''` guards against a style without a name;
        # confirm whether style.name can actually be None
        self.targstyles = [
            style_id for style_id, style in smap.iteritems()
            if (style.name or '').lower().startswith('heading')]

    def is_heading(self, node):
        """
        Return true if the input node is a valid index link target, that
        is, if its paragraph style is one of self.targstyles.
        """
        snodes = XPath("./w:pPr/w:pStyle")(node)
        if len(snodes) == 0:
            return False

        sn = snodes[0]

        # The key includes the long namespace information
        k = [key for key in sn.keys() if key.endswith('}val')]
        if len(k) == 0:
            return False

        style = sn.get(k[0])
        return style in self.targstyles

    def get_headings(self, node):
        """
        Get a list of all children of the input node which are headings -
        that is, valid targets for an index link
        """
        # Iterating an element yields its children, in document order
        return [child for child in node if self.is_heading(child)]

    def text_value(self, node):
        """
        Return the concatenated text of the w:t elements in the runs of
        node, or the placeholder 'Link' when there are none.
        """
        tnodes = XPath("./w:r/w:t")(node)
        if len(tnodes) == 0:
            return 'Link'
        return ''.join((x.text or '') for x in tnodes)

    def find_target(self, node):
        """
        Given an index entry, find the text of the last heading section
        preceding the entry.

        To do this, find the containing w:p element. If it is a heading,
        return the text.

        Otherwise, go up the document level by level, starting with the
        parent of the w:p element containing the entry. At each level, look
        at the children of the parent which come before the child
        containing the entry; the last of them that is a heading is the one
        we want. If no heading is found at all, return 'Link'.

        Perhaps we should precalculate some of this.
        """
        pnode = ancestor(node, 'w:p')
        if pnode is None:
            # NOTE(review): presumably ancestor() gives None when node is
            # not inside a w:p element -- confirm
            return 'Link'
        if self.is_heading(pnode):
            return self.text_value(pnode)

        while True:
            parent = pnode.getparent()
            if parent is None:
                return 'Link'

            # Only the siblings in front of pnode are candidates. Slicing
            # keeps document order and avoids looking up the position of
            # every heading separately.
            pindex = parent.index(pnode)
            preceding = [child for child in parent[:pindex]
                         if self.is_heading(child)]
            if preceding:
                return self.text_value(preceding[-1])

            # Try again, one level up
            pnode = parent

    def bookmarks(self):
        """
        For each index entry we need to insert a bookmark at the target
        location. These bookmarks are for our internal use.

        For each entry we have the Field object, which includes the
        instruction elements of the document. For each of them go to the
        parent, and insert a bookmark start just before it. The name of the
        bookmark and the text of the link target are appended to the entry.
        """
        tag = "{%s}bookmarkStart" % namespaces['w']
        att = "{%s}name" % namespaces['w']
        bmno = 0
        for entry in self.entries:
            for instnode in entry[1].elements:
                name = 'indexBookmark' + str(bmno)
                bmno += 1
                bookmark = lxml.etree.Element(tag)
                bookmark.set(att, name)
                rnode = instnode.getparent()

                # Add the name so that we can link to it
                entry.append(name)

                # insert the bookmark before rnode
                rparent = rnode.getparent()
                rparent.insert(rparent.index(rnode), bookmark)

                # We want the text of the index link to be the content of
                # the closest preceding Heading paragraph.
                # We should make the targets configurable, and add chapter
                # titles and maybe other things.
                # What about numbering?
                entry.append(self.find_target(rnode))

    def gen_styles(self):
        """
        Generate css styles for the index elements.

        We do title, section header, and three levels of entries. These are
        reasonable styles which only set a couple of key values, but we
        could provide an interface to allow the user to set them.
        """
        register = self.convert.styles.register

        # The result is a string we can use as a class name.
        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
        self.title_style = register(css, 'block')

        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
        self.section_style = register(css, 'block')

        # One style per entry level, each indented 20pt more than the last
        self.entry_styles = []
        for i in range(3):
            indent = str(i*20) + 'pt'
            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
            self.entry_styles.append(register(css, 'block'))

    def find_section(self, tag):
        """
        Find the section for this index entry, creating it if required.

        The tag has a form like A or A:B or etc. Sections are keyed by the
        lower cased first character of the tag. Tags that are empty or do
        not start with a letter all go into the section with the key ''.

        If you want a single index without section divisions, you can just
        return the single section here every time.
        """
        # tag[:1] rather than tag[0], so that an empty tag is not an error
        shead = tag[:1]
        # Make it lower case, and group all non-alphabetic things together
        shead = shead.lower() if shead.isalpha() else ''

        if shead not in self.sections:
            self.sections[shead] = Section(self)
        return self.sections[shead]

    def generate(self):
        """
        We generated the index object in the constructor.
        This method writes it into the html.
        """
        # TODO: Only do this at locations of the INDEX field in the document
        body = self.convert.body
        body.append(add_name('Index', self.title_style))

        # And write them to the html
        for key in sorted(self.sections):
            self.sections[key].to_html(key, body, self.convert.anchor_map)
def add_name(str, clname):
    """
    Build a paragraph element with the class clname that contains a single
    span holding the text str, and return the paragraph.
    """
    # Put this into the convert document map?
    from calibre.ebooks.docx.to_html import Text
    label = SPAN()
    holder = Text(label, 'text', [])
    holder.buf.append(str)
    setattr(holder.elem, holder.attr, ''.join(holder.buf))

    para = P()
    para.set('class', clname)
    para.append(label)
    return para
def find_entry(value, dict, index):
    """
    Return the Entry stored for value in the dictionary, creating and
    storing a new one if there is none yet.

    The key is the lower cased value, so that all capitalizations are
    grouped together as a single entry. A new entry keeps the value as it
    was first seen as its name.
    """
    key = value.lower()
    if key not in dict:
        dict[key] = Entry(value, index)
    return dict[key]

View File

@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
# from calibre.ebooks.docx.index import Index
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -97,6 +98,12 @@ class Convert(object):
paras = []
self.log.debug('Converting Word markup to HTML')
# If we are doing an index, do the body part of the processing here.
# We need to insert bookmarks at the indexed locations before the
# main conversion work.
# index = Index(self)
self.read_page_properties(doc)
self.current_rels = relationships_by_id
for wp, page_properties in self.page_map.iteritems():
@ -105,6 +112,7 @@ class Convert(object):
p = self.convert_p(wp)
self.body.append(p)
paras.append(wp)
self.read_block_anchors(doc)
self.styles.apply_contextual_spacing(paras)
# Apply page breaks at the start of every section, except the first
@ -161,6 +169,9 @@ class Convert(object):
self.resolve_links()
# For an index, we now want to append the index object
# index.generate()
self.styles.cascade(self.layers)
self.tables.apply_markup(self.object_map, self.page_map)