diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 9e0e3d2a8c..2ffa2b84b1 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -8,6 +8,7 @@ __copyright__ = '2013, Kovid Goyal ' import re, os +from calibre.ebooks.docx.index import process_index from calibre.ebooks.docx.names import XPath, get, namespaces TEST_INDEX = 'CALIBRE_TEST_INDEX' in os.environ @@ -82,7 +83,7 @@ parse_xe = parser('xe', parse_index = parser('index', 'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator' ' f:entry-type g:page-range-separator h:heading k:crossref-separator' - ' p:page-number-separator r:run-together y:yomi z:langcode') + ' l:page-number-separator p:letter-range s:sequence-name r:run-together y:yomi z:langcode') class Fields(object): @@ -173,15 +174,20 @@ class Fields(object): bm = p.makeelement(WORD('bookmarkEnd')) bm.set(WORD('id'), bmark) p.insert(p.index(field.end) + 1, bm) + xe['start_elem'] = field.start self.xe_fields.append(xe) def parse_index(self, field, parse_func, log): - # Parse Index fields if not TEST_INDEX: return + if not field.contents: + return idx = parse_func(field.instructions, log) - # TODO: parse the field contents - self.index_fields.append(idx) + hyperlinks, blocks = process_index(field, idx, self.xe_fields, log) + for anchor, run in hyperlinks: + self.hyperlink_fields.append(({'anchor':anchor}, [run])) + + self.index_fields.append((idx, blocks)) def test_parse_fields(): import unittest diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py index 3e7e9dc42f..bf916252b6 100644 --- a/src/calibre/ebooks/docx/index.py +++ b/src/calibre/ebooks/docx/index.py @@ -6,416 +6,115 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2014, Kovid Goyal ' -from collections import OrderedDict +from operator import itemgetter -from lxml.html.builder import A, SPAN -import lxml.etree +from calibre.ebooks.docx.names import XPath, expand +from calibre.utils.icu import partition_by_first_letter, sort_key -from calibre.ebooks.docx.names import XPath, ancestor, namespaces +def get_applicable_xe_fields(index, xe_fields): + iet = index.get('entry-type', None) + xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet] + lr = index.get('letter-range', None) + if lr is not None: + sl, el = lr.parition('-')[0::2] + sl, el = sl.strip(), el.strip() + if sl and el: + def inrange(text): + return sl <= text[0] <= el + xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))] -NBSP = '\xa0' + bmark = index.get('bookmark', None) + if bmark is None: + return xe_fields + attr = expand('w:name') + bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark} + ancestors = XPath('ancestor::w:bookmarkStart') -class Location(object): - r""" - This class represents one location in the index. - We should provide a way to mark the main entries. Libre office - has a main attribute, which doesn't seem to map to docx, and at least - some versions of word can mark entries bold or italic with \b and \i. - One index entry corresponds to a list of locations where the entry - is referenced in the text. - """ + def contained(xe): + # Check if the xe field is contained inside a bookmark with the + # specified name + return bool(set(ancestors(xe['start_elem'])) & bookmarks) - def __init__(self, bookmark, target): - self.bookmark = bookmark - self.target = target + return [xe for xe in xe_fields if contained(xe)] -class Entry(object): - """ - This class represents one index entry. - We can also have a list of sub-entries for the primary/secondary - topic situation. - Each entry has a list of locations we want to point to, but - it could be empty if this is only here to organize sub-entries. - """ +def make_block(style, parent, pos): + p = parent.makeelement(expand('w:p')) + parent.insert(pos, p) + if style is not None: + ppr = p.makeelement(expand('w:pPr')) + p.append(ppr) + ps = ppr.makeelement(expand('w:pStyle')) + ppr.append(ps) + ps.set(expand('w:val'), style) + r = p.makeelement(expand('w:r')) + p.append(r) + t = r.makeelement(expand('w:t')) + t.set(expand('xml:space'), 'preserve') + r.append(t) + return p, t - def __init__(self, name, index): - self.subentries = {} - self.locations = [] - self.name = name - self.index = index +def add_xe(xe, t): + text = xe.get('text', '') + pt = xe.get('page-number-text', None) + t.text = text or ' ' + if pt: + p = t.getparent().getparent() + r = p.makeelement(expand('w:r')) + p.append(r) + t2 = r.makeelement(expand('w:t')) + t2.set(expand('xml:space'), 'preserve') + t2.text = ' [%s]' % pt + r.append(t2) + return xe['anchor'], t.getparent() - def add_entry(self, entry, sub): - """ - The entry has the form [xxx, field, bookmark, target] - """ - if len(sub) == 0: - self.locations.append(Location(entry[2], entry[3])) +def process_index(field, index, xe_fields, log): + ''' + We remove all the word generated index markup and replace it with our own + that is more suitable for an ebook. + ''' + styles = [] + heading_text = index.get('heading', None) + heading_style = 'IndexHeading' + start_pos = None + for elem in field.contents: + if elem.tag.endswith('}p'): + s = XPath('descendant::pStyle/@w:val')(elem) + if s: + styles.append(s[0]) + p = elem.getparent() + if start_pos is None: + start_pos = (p, p.index(elem)) + p.remove(elem) + + xe_fields = get_applicable_xe_fields(index, xe_fields) + if not xe_fields: + return + if heading_text is not None: + groups = partition_by_first_letter(xe_fields, key=itemgetter('text')) + items = [] + for key, fields in groups.iteritems(): + items.append(key), items.extend(fields) + if styles: + heading_style = styles[0] + else: + items = sorted(xe_fields, key=lambda x:sort_key(x['text'])) + + hyperlinks = [] + blocks = [] + for item in reversed(items): + is_heading = not isinstance(item, dict) + style = heading_style if is_heading else None + p, t = make_block(style, *start_pos) + if is_heading: + text = heading_text + if text.lower().startswith('a'): + text = item + text[1:] + t.text = text else: - sube = find_entry(sub[0], self.subentries, self.index) - sube.add_entry(entry, sub[1:]) + hyperlinks.append(add_xe(item, t)) + blocks.append(p) - def make_link(self, loc, amap): - # As a first pass, we just put a placeholder in the target location - # We want it to float right - markid = amap[loc.bookmark] - if markid is None: - return + return hyperlinks, blocks - span = A() - span.set('style', 'float:right') - span.set('href', '#' + markid) - from calibre.ebooks.docx.to_html import Text - text = Text(span, 'text', []) - text.buf.append(loc.target) - setattr(text.elem, text.attr, ''.join(text.buf)) - return span - def to_htmlunit(self, body, level, amap): - """ - Append the material for one index entry to the document. - There is a name, and 0 or more locations. - Put the first location, if any, on the same line as the - name, and others on following lines. - """ - style = self.index.entry_styles[level] - main = add_name(self.name, style) - if len(self.locations) == 0: - body.append(main) - return - - # First link on same line as name - link = self.make_link(self.locations[0], amap) - main.append(link) - body.append(main) - - # Put other links for same entry on their own lines - # To keep the link span separate need to put a space as the name - for l in self.locations[1:]: - link = self.make_link(l, amap) - dest = P() - dest.set('class', style) - dest.text = NBSP - dest.append(link) - body.append(dest) - - def to_html(self, body, level, amap): - level = min(level, 2) - self.to_htmlunit(body, level, amap) - for key in sorted(self.subentries.keys()): - self.subentries[key].to_html(body, level + 1, amap) - -class Section(object): - """ - This class represents one section of the index - usually, - for example, the A's or the B's. - It is primarily a dictionary of entries. - """ - - def __init__(self, index): - self.index = index - self.entries = {} - - def add_entry(self, entry): - """ - We have information from one index marker. - The entry has form [name, field, bookmark, target]. - The name is something like A or A:B and so on. - If we already have an entry for that name, just add the new - location to it; otherwise create a new entry. - """ - topics = entry[0].strip('"').split(':') - targ = find_entry(topics[0], self.entries, self.index) - targ.add_entry(entry, topics[1:]) - - def to_html(self, key, body, amap): - """ - Add one section of the index to the html - """ - if len(key) > 0: - body.append(add_name(key, self.index.section_style)) - for ekey in sorted(self.entries.keys()): - self.entries[ekey].to_html(body, 0, amap) - -class Index(object): - """ - This class generates an alphabetical index from the index markers in a docx file. - - Each field in the parse of the docx file contains an instructions list. - Instructions with name XE are index instructions. - The instruction also contains the entry specifier, of the form A[:B[:C]] for - main entry, A, subentry B, and so on. - - The index object is a dictionary of sections, 'A' mapping to a section - object with all the A entries, and so on. Each section in turn is a dictionary - mapping an index specifier, like A:B, to a list of locations where that - entry is referenced. - - We could make the formatting more configurable. - Currently it uses fixed styles for the various elements, and a section - heading for each letter. - """ - - def __init__(self, convert): - """ - Convert the index markers in the document into an index object. - """ - self.convert = convert - self.sections = {} - - self.gen_styles() - - # Get a list of [name, field] entries, where name is the index - # entry and field is the indexed location - self.entries = self.get_entries() - - # Find styles which are provide the text for links. - self.target_styles() - - # Generate bookmarks in the document at the indexed locations - self.bookmarks() - - # Set up the entries in index sections - for unit in self.entries: - sec = self.find_section(unit[0]) - sec.add_entry(unit) - - def get_entries(self): - r""" - We already have a list of fields which includes the index marks, - identified by an XE tag. - In the base case, the field object includes an instruction list - with one tuple like ('XE', '"entry"'), where entry is the text we - want to put in the index. Note the double quotes around the entry. - Sometimes the entry is broken up in the document, for example if - there are spelling issues in the entry text. - In this case, for reasons I don't understand, the instruction - list includes a number of tuples, and we get the actual entry - text by concatenating all of them after the initial tag. - There can be formatting information in the instructions also, after - the double quoted part, like '"entry" \b'. - So, we want to concatenate all parts after the initial tag, and - then get the part in double quotes. - """ - fields = self.convert.fields.fields - - def get_entry(field): - elist = [field.instructions[0][1]] - for inst in field.instructions[1:]: - elist.append(inst[0]) - elist.append(inst[1]) - - entry = ''.join(elist) - sep1 = entry.partition('"') - if sep1[2] == '': - return entry - sep2 = sep1[2].partition('"') - return sep2[0] - - # Only want the index entries - return [[get_entry(f), f] for f in fields - if f.instructions and f.instructions[0][0] == 'XE'] - - def target_styles(self): - """ - We want to get a list of styles which represent valid index targets. - That is, the text of a link in the index will be the title of the - section of the document containing the indexed location. - We want the list of styles which can provide a valid title. - In practice, this maps to Heading1 through Heading3 in the original document. - Calibre apparently preprocesses docx files, so that a paragraph in - the original with style Heading1 will now have a different, internal style. - In this version we use convert.styles.id_map to find style ids - with internal names beginning Heading; but I'd feel better if we - jumped in earlier and could map it to the original docx styles. - """ - smap = self.convert.styles.id_map - self.targstyles = [name for name, style in smap.iteritems() if style.name.lower().startswith('heading')] - - def is_heading(self, node): - """ - Return true if the input node is a valid index link target. - """ - snodes = XPath("./w:pPr/w:pStyle")(node) - if len(snodes) == 0: - return False - - sn = snodes[0] - - # The key includes the long namespace information - k = [key for key in sn.keys() if key.endswith('}val')] - if len(k) == 0: - return False - style = sn.get(k[0]) - return style in self.targstyles - - def get_headings(self, node): - """ - Get a list of all children of the input node which are headings - - that is, valid targets for an index link - """ - answer = [] - for c in node.getchildren(): - if self.is_heading(c): - answer.append(c) - return answer - - def text_value(self, node): - tnodes = XPath("./w:r/w:t")(node) - if len(tnodes) == 0: - return 'Link' - return ''.join((x.text or '') for x in tnodes) - - def find_target(self, node): - """ - Given an index entry, find the text of the last heading section - preceding the entry. - To do this, find the containing w:p element. If it is a heading, - return the text. - Otherwise, go up the document level by level, staring with the - parent of the w:p element containing the entry. - At each level, get the list of heading w:p elements which are - children of the top node. We also have the index in the top node - of the child node containing the entry. - Find the largest index of a heading child which is < the entry - index, if any - that is the heading we want. - Perhaps we should precalculate some of this. - We could also consider doing some of this in xpath, but the style - attributes have been modified, so we can't just look for the - original names. - """ - pnode = ancestor(node, 'w:p') - if self.is_heading(pnode): - return self.text_value(pnode) - - while True: - parent = pnode.getparent() - if parent is None: - return 'Link' - - # Maintain document order in these lists - pindex = parent.index(pnode) - hlist = self.get_headings(parent) - hlist = filter(lambda x: parent.index(x) < pindex, hlist) - if len(hlist) > 0: - return self.text_value(hlist[-1]) - - # Try again - pnode = parent - - def bookmarks(self): - """ - For each index entry we need to insert a bookmark at the target location. - These bookmarks are for our internal use - I'm not sure they would work well - in the original docx document. - For each entry we have the Field object, which includes the instrText - element of the document. - Try going to the parent, and inserting a bookmark start just before it. - """ - bmno = 0 - for entry in self.entries: - for instnode in entry[1].elements: - name = 'indexBookmark' + str(bmno) - bmno += 1 - tag = "{%s}bookmarkStart" % namespaces['w'] - att = "{%s}name" % namespaces['w'] - bookmark = lxml.etree.Element(tag) - bookmark.set(att, name) - rnode = instnode.getparent() - - # Add the name so that we can link to it - entry.append(name) - - # insert the bookmark before rnode - rparent = rnode.getparent() - rind = rparent.index(rnode) - rparent.insert(rind, bookmark) - - # We want the index entry to be the content of the closest - # preceding Heading paragraph. - # We should make the targets configurable, and add chapter - # titles and maybe other things. - # What about numbering? - targnode = self.find_target(rnode) - entry.append(targnode) - - def gen_styles(self): - """ - Generate css styles for the index elements. - We do title, section header, and three levels of entries. - These are reasonable styles which only set a couple of key - values, but we could provide an interface to allow the user to set them. - Is there any problem registering the styles this early in the - conversion process? - """ - # The result is a string we can use as a class name. - css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')]) - self.title_style = self.convert.styles.register(css, 'block') - - css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')]) - self.section_style = self.convert.styles.register(css, 'block') - - self.entry_styles = [] - for i in range(3): - indent = str(i*20) + 'pt' - css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)]) - self.entry_styles.append(self.convert.styles.register(css, 'block')) - - def find_section(self, tag): - """ - Find the section for this index entry, creating it if required. - The tag has a form like A or A:B or etc. - If you want a single index without section divisions, you can - just return the single section here every time. - """ - shead = tag[0] - - # Make it lower case, and group all non-alphabetic things together - if shead.isalpha(): - shead = shead.lower() - else: - shead = '' - - if shead in self.sections: - return self.sections[shead] - sect = Section(self) - self.sections[shead] = sect - return sect - - def generate(self): - """ - We generated the index object in the constructor. - This method writes it into the html. - """ - # TODO: Only do this at locations of the INDEX field in the document - body = self.convert.body - body.append(add_name('Index', self.title_style)) - - # And write them to the html - for key in sorted(self.sections.keys()): - self.sections[key].to_html(key, body, self.convert.anchor_map) - -def add_name(str, clname): - # Put this into the convert document map? - dest = P() - dest.set('class', clname) - span = SPAN() - from calibre.ebooks.docx.to_html import Text - text = Text(span, 'text', []) - text.buf.append(str) - setattr(text.elem, text.attr, ''.join(text.buf)) - dest.append(span) - return dest - -def find_entry(value, dict, index): - """ - Find the Entry in the dictionary, or create a new one. - We convert to lower case to group all capitalizations - together as a single entry. - """ - lvalue = value.lower() - if lvalue in dict: - return dict[lvalue] - ent = Entry(value, index) - dict[lvalue] = ent - return ent diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 5fe7ddcbf1..678b770b02 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -26,7 +26,7 @@ from calibre.ebooks.docx.footnotes import Footnotes from calibre.ebooks.docx.cleanup import cleanup_markup from calibre.ebooks.docx.theme import Theme from calibre.ebooks.docx.toc import create_toc -from calibre.ebooks.docx.fields import Fields, TEST_INDEX +from calibre.ebooks.docx.fields import Fields from calibre.ebooks.docx.settings import Settings from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 @@ -98,10 +98,6 @@ class Convert(object): self.log.debug('Converting Word markup to HTML') - if TEST_INDEX: - from calibre.ebooks.docx.index import Index - self.index = Index(self.fields) - self.read_page_properties(doc) self.current_rels = relationships_by_id for wp, page_properties in self.page_map.iteritems(): @@ -167,9 +163,6 @@ class Convert(object): self.resolve_links() - if TEST_INDEX: - self.index.generate() - self.styles.cascade(self.layers) self.tables.apply_markup(self.object_map, self.page_map)