From f790038819c45614eb0ebff141600da28f63d28d Mon Sep 17 00:00:00 2001 From: Peter Garst Date: Tue, 25 Mar 2014 15:26:20 -0700 Subject: [PATCH 1/9] This material adds the option to generate an alphabetical index from index markers in a docx file. --- .../ebooks/conversion/plugins/docx_input.py | 5 +- src/calibre/ebooks/docx/container.py | 3 +- src/calibre/ebooks/docx/fields.py | 4 + src/calibre/ebooks/docx/index.py | 427 ++++++++++++++++++ src/calibre/ebooks/docx/to_html.py | 17 +- src/calibre/gui2/convert/docx_input.py | 2 +- src/calibre/gui2/convert/docx_input.ui | 7 + 7 files changed, 461 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/docx/index.py diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py index 30a4bb3868..fe987ea28f 100644 --- a/src/calibre/ebooks/conversion/plugins/docx_input.py +++ b/src/calibre/ebooks/conversion/plugins/docx_input.py @@ -19,6 +19,9 @@ class DOCXInput(InputFormatPlugin): help=_('Normally, if a large image is present at the start of the document that looks like a cover, ' 'it will be removed from the document and used as the cover for created ebook. This option ' 'turns off that behavior.')), + OptionRecommendation(name='docx_index', recommended_value=False, + help=_('If there are embedded index markers in the document, this option will use them to create ' + 'an alphabetical index with links to the locations of the markers.')), } @@ -26,5 +29,5 @@ class DOCXInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.docx.to_html import Convert - return Convert(stream, detect_cover=not options.docx_no_cover, log=log)() + return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)() diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index deaf5bd4d0..82b3fc9f4b 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -22,7 +22,8 @@ from calibre.utils.zipfile import ZipFile from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER def fromstring(raw, parser=RECOVER_PARSER): - return etree.fromstring(raw, parser=parser) + res = etree.fromstring(raw, parser=parser) + return res # Read metadata {{{ def read_doc_props(raw, mi): diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 6617728f0c..189927a6c6 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -10,12 +10,15 @@ import re from calibre.ebooks.docx.names import XPath, get +import sys + class Field(object): def __init__(self, start): self.start = start self.end = None self.contents = [] + self.elements = [] self.instructions = [] def add_instr(self, elem): @@ -24,6 +27,7 @@ class Field(object): return name, rest = raw.strip().partition(' ')[0::2] self.instructions.append((name, rest.strip())) + self.elements.append(elem) WORD, FLAG = 0, 1 scanner = re.Scanner([ diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py new file mode 100644 index 0000000000..23865210a0 --- /dev/null +++ b/src/calibre/ebooks/docx/index.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2014, Kovid Goyal ' + +import itertools +from collections import OrderedDict +from lxml import html +from lxml.html.builder import ( + HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1) +from calibre.ebooks.docx.names import ( + XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor, + ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS) + +import lxml.etree + +NBSP = '\xa0' + +class Location: + """ + This class represents one location in the index. + We should provide a way to mark the main entries. Libre office + has a main attribute, which doesn't seem to map to docx, and at least + some versions of word can mark entries bold or italic with \b and \i. + One index entry corresponds to a list of locations where the entry + is referenced in the text. + """ + + def __init__(self, bookmark, target): + self.bookmark = bookmark + self.target = target + +class Entry: + """ + This class represents one index entry. + We can also have a list of subentries for the primary/secondary + topic situation. + Each entry has a list of locations we want to point to, but + it could be empty if this is only here to organize subentries. + """ + + def __init__(self, name, index): + self.subentries = {} + self.locations = [] + self.name = name + self.index = index + + def addEntry(self, entry, sub): + """ + The entry has the form [xxx, field, bookmark, target] + """ + if len(sub) == 0: + self.locations.append(Location(entry[2], entry[3])) + else: + sube = Index.findEntry(sub[0], self.subentries, self.index) + sube.addEntry(entry, sub[1:]) + + def makeLink(self, loc, amap): + # As a first pass, we just put a placeholder in the target location + # We want it to float right + markid = amap[loc.bookmark] + if markid == None: + return + + span = A() + span.set('style', 'float:right') + span.set('href', '#' + markid) + from calibre.ebooks.docx.to_html import Text + text = Text(span, 'text', []) + text.buf.append(loc.target) + setattr(text.elem, text.attr, ''.join(text.buf)) + return span + + def toHtmlUnit(self, body, level, amap): + """ + Append the material for one index entry to the document. + There is a name, and 0 or more locations. + Put the first location, if any, on the same line as the + name, and others on following lines. + """ + style = self.index.entryStyles[level] + main = Index.addName(self.name, style) + if len(self.locations) == 0: + body.append(main) + return + + # First link on same line as name + link = self.makeLink(self.locations[0], amap) + main.append(link) + body.append(main) + + # Put other links for same entry on their own lines + # To keep the link span separate need to put a space as the name + for l in self.locations[1:]: + link = self.makeLink(l, amap) + dest = P() + dest.set('class', style) + dest.text = NBSP + dest.append(link) + body.append(dest) + + def toHtml(self, body, level, amap): + level = min(level, 2) + self.toHtmlUnit(body, level, amap) + for key in sorted(self.subentries.keys()): + self.subentries[key].toHtml(body, level + 1, amap) + +class Section: + """ + This class represents one section of the index - usually, + for example, the A's or the B's. + It is primarily a dictionary of entries. + """ + + def __init__(self, index): + self.index = index + self.entries = {} + + def addEntry(self, entry): + """ + We have information from one index marker. + The entry has form [name, field, bookmark, target]. + The name is something like A or A:B and so on. + If we already have an entry for that name, just add the new + location to it; otherwise create a new entry. + """ + topics = entry[0].strip('"').split(':') + targ = Index.findEntry(topics[0], self.entries, self.index) + targ.addEntry(entry, topics[1:]) + + def toHtml(self, key, body, amap): + """ + Add one section of the index to the html + """ + if len(key) > 0: + body.append(Index.addName(key, self.index.sectionStyle)) + for ekey in sorted(self.entries.keys()): + self.entries[ekey].toHtml(body, 0, amap) + +class Index: + """ + This class generates an alphabetical index from the index markers in a docx file. + + Each field in the parse of the docx file contains an instructions list. + Instructions with name XE are index instructions. + The instruction also contains the entry specifier, of the form A[:B[:C]] for + main entry, A, subentry B, and so on. + + The index object is a dictionary of sections, 'A' mapping to a section + object with all the A entries, and so on. Each section in turn is a dictionary + mapping an index specifier, like A:B, to a list of locations where that + entry is referenced. + + We could make the formatting more configurable. + Currently it uses fixed styles for the various elements, and a section + heading for each letter. + """ + + def __init__(self, convert): + """ + Convert the index markers in the document into an index object. + """ + self.convert = convert + self.sections = {} + + self.genStyles() + + # Get a list of [name, field] entries, where name is the index + # entry and field is the indexed location + self.entries = self.getEntries() + + # Find styles which are provide the text for links. + self.targetStyles() + + # Generate bookmarks in the document at the indexed locations + self.bookmarks() + + # Set up the entries in index sections + for unit in self.entries: + sec = self.findSection(unit[0]) + sec.addEntry(unit) + + def getEntries(self): + """ + We already have a list of fields which includes the index marks, + identified by an XE tag. + In the base case, the field object includes an instruction list + with one tuple like ('XE', '"entry"'), where entry is the text we + want to put in the index. Note the double quotes around the entry. + Sometimes the entry is broken up in the document, for example if + there are spelling issues in the entry text. + In this case, for reasons I don't understand, the instruction + list includes a number of tuples, and we get the actual entry + text by concatenating all of them after the initial tag. + There can be formatting information in the instructions also, after + the double quoted part, like '"entry" \b'. + So, we want to concatenate all parts after the initial tag, and + then get the part in double quotes. + """ + fields = self.convert.fields.fields + + # Only want the index entries + fields = filter(lambda f: len(f.instructions) > 0 and f.instructions[0][0] == 'XE', fields) + return map(lambda f: [self.getEntry(f), f], fields) + + def getEntry(self, field): + + elist = [field.instructions[0][1]] + for inst in field.instructions[1:]: + elist.append(inst[0]) + elist.append(inst[1]) + + entry = ''.join(elist) + sep1 = entry.partition('"') + if sep1[2] == '': + return entry + sep2 = sep1[2].partition('"') + return sep2[0] + + def targetStyles(self): + """ + We want to get a list of styles which represent valid index targets. + That is, the text of a link in the index will be the title of the + section of the document containing the indexed location. + We want the list of styles which can provide a valid title. + In practice, this maps to Heading1 through Heading3 in the original document. + Calibre apparently preprocesses docx files, so that a paragraph in + the original with style Heading1 will now have a different, internal style. + In this version we use convert.styles.id_map to find style ids + with internal names beginning Heading; but I'd feel better if we + jumped in earlier and could map it to the original docx styles. + """ + smap = self.convert.styles.id_map + self.targstyles = [name for name, style in smap.iteritems() if style.name.startswith('Heading')] + + def isHeading(self, node): + """ + Return true if the input node is a valid index link target. + """ + snodes = XPath("./w:pPr/w:pStyle")(node) + if len(snodes) == 0: + return False; + + sn = snodes[0] + + # The key includes the long namespace information + k = [key for key in sn.keys() if key.endswith('}val')] + if len(k) == 0: + return False + style = sn.get(k[0]) + return style in self.targstyles + + def getHeadings(self, node): + """ + Get a list of all children of the input node which are headings - + that is, valid targets for an index link + """ + answer = [] + for c in node.getchildren(): + if self.isHeading(c): + answer.append(c) + return answer + + def textValue(self, node): + tnodes = XPath("./w:r/w:t")(node) + if len(tnodes) == 0: + return 'Link' + textl = map(lambda x: x.text, tnodes) + return ''.join(textl) + + def findTarget(self, node): + """ + Given an index entry, find the text of the last heading section + preceding the entry. + To do this, find the containing w:p element. If it is a heading, + return the text. + Otherwise, go up the document level by level, staring with the + parent of the w:p element containing the entry. + At each level, get the list of heading w:p elements which are + children of the top node. We also have the index in the top node + of the child node containing the entry. + Find the largest index of a heading child which is < the entry + index, if any - that is the heading we want. + Perhaps we should precalculate some of this. + We could also consider doing some of this in xpath, but the style + attributes have been modified, so we can't just look for the + original names. + """ + pnode = ancestor(node, 'w:p') + if self.isHeading(pnode): + return self.textValue(pnode) + + while True: + parent = pnode.getparent() + if parent == None: + return 'Link' + + # Maintain document order in these lists + pindex = parent.index(pnode) + hlist = self.getHeadings(parent) + hlist = filter(lambda x: parent.index(x) < pindex, hlist) + if len(hlist) > 0: + return self.textValue(hlist[-1]) + + # Try again + pnode = parent + + def bookmarks(self): + """ + For each index entry we need to insert a bookmark at the target location. + These bookmarks are for our internal use - I'm not sure they would work well + in the original docx document. + For each entry we have the Field object, which includes the instrText + element of the document. + Try going to the parent, and inserting a bookmark start just before it. + """ + bmno = 0 + for entry in self.entries: + for instnode in entry[1].elements: + name = 'indexBookmark' + str(bmno) + bmno += 1 + tag = "{%s}bookmarkStart" % namespaces['w'] + att = "{%s}name" % namespaces['w'] + bookmark = lxml.etree.Element(tag) + bookmark.set(att, name) + rnode = instnode.getparent() + + # Add the name so that we can link to it + entry.append(name) + + # insert the bookmark before rnode + rparent = rnode.getparent() + rind = rparent.index(rnode) + rparent.insert(rind, bookmark) + + # We want the index entry to be the content of the closest + # preceding Heading paragraph. + # We should make the targets configurable, and add chapter + # titles and maybe other things. + # What about numbering? + targnode = self.findTarget(rnode) + entry.append(targnode) + + def genStyles(self): + """ + Generate css styles for the index elements. + We do title, section header, and three levels of entries. + These are reasonable styles which only set a couple of key + values, but we could provide an interface to allow the user to set them. + Is there any problem registering the styles this early in the + conversion process? + """ + # The result is a string we can use as a class name. + css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')]) + self.titleStyle = self.convert.styles.register(css, 'block') + + css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')]) + self.sectionStyle = self.convert.styles.register(css, 'block') + + self.entryStyles = [] + for i in range(3): + indent = str(i*20) + 'pt' + css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)]) + self.entryStyles.append(self.convert.styles.register(css, 'block')) + + def findSection(self, tag): + """ + Find the section for this index entry, creating it if required. + The tag has a form like A or A:B or etc. + If you want a single index without section divisions, you can + just return the single section here every time. + """ + shead = tag[0] + + # Make it lower case, and group all non-alphabetic things together + if shead.isalpha(): + shead = shead.lower() + else: + shead = '' + + if shead in self.sections: + return self.sections[shead] + sect = Section(self) + self.sections[shead] = sect + return sect + + def generate(self): + """ + We generated the index object in the constructor. + This method writes it into the html. + """ + body = self.convert.body + body.append(Index.addName('Index', self.titleStyle)) + + # And write them to the html + for key in sorted(self.sections.keys()): + self.sections[key].toHtml(key, body, self.convert.anchor_map) + + @staticmethod + def addName(str, clname): + # Put this into the convert document map? + dest = P() + dest.set('class', clname) + span = SPAN() + from calibre.ebooks.docx.to_html import Text + text = Text(span, 'text', []) + text.buf.append(str) + setattr(text.elem, text.attr, ''.join(text.buf)) + dest.append(span) + return dest + + @staticmethod + def findEntry(value, dict, index): + """ + Find the Entry in the dictionary, or create a new one. + We convert to lower case to group all capitalizations + together as a single entry. + """ + lvalue = value.lower() + if lvalue in dict: + return dict[lvalue] + ent = Entry(value, index) + dict[lvalue] = ent + return ent diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 1b1622d219..3b0be2febf 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme from calibre.ebooks.docx.toc import create_toc from calibre.ebooks.docx.fields import Fields from calibre.ebooks.docx.settings import Settings +from calibre.ebooks.docx.index import Index from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 @@ -44,12 +45,13 @@ class Text: class Convert(object): - def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None): + def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None): self.docx = DOCX(path_or_stream, log=log) self.ms_pat = re.compile(r'\s{2,}') self.ws_pat = re.compile(r'[\n\r\t]') self.log = self.docx.log self.detect_cover = detect_cover + self.do_index = do_index self.notes_text = notes_text or _('Notes') self.dest_dir = dest_dir or os.getcwdu() self.mi = self.docx.metadata @@ -97,6 +99,14 @@ class Convert(object): paras = [] self.log.debug('Converting Word markup to HTML') + + # If we are doing an index, do the body part of the processing here. + # We need to insert bookmarks at the indexed locations before the + # main conversion work. + if self.do_index: + self.log.debug('Generating index') + index = Index(self) + self.read_page_properties(doc) self.current_rels = relationships_by_id for wp, page_properties in self.page_map.iteritems(): @@ -105,6 +115,7 @@ class Convert(object): p = self.convert_p(wp) self.body.append(p) paras.append(wp) + self.read_block_anchors(doc) self.styles.apply_contextual_spacing(paras) # Apply page breaks at the start of every section, except the first @@ -157,6 +168,10 @@ class Convert(object): parent.text = tabs[-1].tail or '' map(parent.remove, tabs) + # For an index, we now want to append the index object + if self.do_index: + index.generate() + self.images.rid_map = orig_rid_map self.resolve_links() diff --git a/src/calibre/gui2/convert/docx_input.py b/src/calibre/gui2/convert/docx_input.py index 46234c6a36..0fbfb66634 100644 --- a/src/calibre/gui2/convert/docx_input.py +++ b/src/calibre/gui2/convert/docx_input.py @@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['docx_no_cover', ]) + ['docx_no_cover', 'docx_index', ]) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/docx_input.ui b/src/calibre/gui2/convert/docx_input.ui index 41948118dc..8f3843dd2a 100644 --- a/src/calibre/gui2/convert/docx_input.ui +++ b/src/calibre/gui2/convert/docx_input.ui @@ -21,6 +21,13 @@ + + + + Generate an alphabetical index from embedded index markers + + + From 353d6dee6ca1039b1b310b8ea8b1e6872f582540 Mon Sep 17 00:00:00 2001 From: Peter Garst Date: Thu, 27 Mar 2014 11:53:44 -0700 Subject: [PATCH 2/9] Style improvements and debugging in the docx indexing code --- src/calibre/ebooks/docx/index.py | 136 +++++++++++++++---------------- 1 file changed, 67 insertions(+), 69 deletions(-) diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py index 23865210a0..45490849c4 100644 --- a/src/calibre/ebooks/docx/index.py +++ b/src/calibre/ebooks/docx/index.py @@ -19,7 +19,7 @@ import lxml.etree NBSP = '\xa0' -class Location: +class Location(object): """ This class represents one location in the index. We should provide a way to mark the main entries. Libre office @@ -33,7 +33,7 @@ class Location: self.bookmark = bookmark self.target = target -class Entry: +class Entry(object): """ This class represents one index entry. We can also have a list of subentries for the primary/secondary @@ -48,17 +48,17 @@ class Entry: self.name = name self.index = index - def addEntry(self, entry, sub): + def add_entry(self, entry, sub): """ The entry has the form [xxx, field, bookmark, target] """ if len(sub) == 0: self.locations.append(Location(entry[2], entry[3])) else: - sube = Index.findEntry(sub[0], self.subentries, self.index) - sube.addEntry(entry, sub[1:]) + sube = find_entry(sub[0], self.subentries, self.index) + sube.add_entry(entry, sub[1:]) - def makeLink(self, loc, amap): + def make_link(self, loc, amap): # As a first pass, we just put a placeholder in the target location # We want it to float right markid = amap[loc.bookmark] @@ -74,41 +74,41 @@ class Entry: setattr(text.elem, text.attr, ''.join(text.buf)) return span - def toHtmlUnit(self, body, level, amap): + def to_htmlunit(self, body, level, amap): """ Append the material for one index entry to the document. There is a name, and 0 or more locations. Put the first location, if any, on the same line as the name, and others on following lines. """ - style = self.index.entryStyles[level] - main = Index.addName(self.name, style) + style = self.index.entry_styles[level] + main = add_name(self.name, style) if len(self.locations) == 0: body.append(main) return # First link on same line as name - link = self.makeLink(self.locations[0], amap) + link = self.make_link(self.locations[0], amap) main.append(link) body.append(main) # Put other links for same entry on their own lines # To keep the link span separate need to put a space as the name for l in self.locations[1:]: - link = self.makeLink(l, amap) + link = self.make_link(l, amap) dest = P() dest.set('class', style) dest.text = NBSP dest.append(link) body.append(dest) - def toHtml(self, body, level, amap): + def to_html(self, body, level, amap): level = min(level, 2) - self.toHtmlUnit(body, level, amap) + self.to_htmlunit(body, level, amap) for key in sorted(self.subentries.keys()): - self.subentries[key].toHtml(body, level + 1, amap) + self.subentries[key].to_html(body, level + 1, amap) -class Section: +class Section(object): """ This class represents one section of the index - usually, for example, the A's or the B's. @@ -119,7 +119,7 @@ class Section: self.index = index self.entries = {} - def addEntry(self, entry): + def add_entry(self, entry): """ We have information from one index marker. The entry has form [name, field, bookmark, target]. @@ -128,19 +128,19 @@ class Section: location to it; otherwise create a new entry. """ topics = entry[0].strip('"').split(':') - targ = Index.findEntry(topics[0], self.entries, self.index) - targ.addEntry(entry, topics[1:]) + targ = find_entry(topics[0], self.entries, self.index) + targ.add_entry(entry, topics[1:]) - def toHtml(self, key, body, amap): + def to_html(self, key, body, amap): """ Add one section of the index to the html """ if len(key) > 0: - body.append(Index.addName(key, self.index.sectionStyle)) + body.append(add_name(key, self.index.sectionStyle)) for ekey in sorted(self.entries.keys()): - self.entries[ekey].toHtml(body, 0, amap) + self.entries[ekey].to_html(body, 0, amap) -class Index: +class Index(object): """ This class generates an alphabetical index from the index markers in a docx file. @@ -166,24 +166,24 @@ class Index: self.convert = convert self.sections = {} - self.genStyles() + self.gen_styles() # Get a list of [name, field] entries, where name is the index # entry and field is the indexed location - self.entries = self.getEntries() + self.entries = self.get_entries() # Find styles which are provide the text for links. - self.targetStyles() + self.target_styles() # Generate bookmarks in the document at the indexed locations self.bookmarks() # Set up the entries in index sections for unit in self.entries: - sec = self.findSection(unit[0]) - sec.addEntry(unit) + sec = self.find_section(unit[0]) + sec.add_entry(unit) - def getEntries(self): + def get_entries(self): """ We already have a list of fields which includes the index marks, identified by an XE tag. @@ -204,9 +204,9 @@ class Index: # Only want the index entries fields = filter(lambda f: len(f.instructions) > 0 and f.instructions[0][0] == 'XE', fields) - return map(lambda f: [self.getEntry(f), f], fields) + return map(lambda f: [self.get_entry(f), f], fields) - def getEntry(self, field): + def get_entry(self, field): elist = [field.instructions[0][1]] for inst in field.instructions[1:]: @@ -220,7 +220,7 @@ class Index: sep2 = sep1[2].partition('"') return sep2[0] - def targetStyles(self): + def target_styles(self): """ We want to get a list of styles which represent valid index targets. That is, the text of a link in the index will be the title of the @@ -234,9 +234,9 @@ class Index: jumped in earlier and could map it to the original docx styles. """ smap = self.convert.styles.id_map - self.targstyles = [name for name, style in smap.iteritems() if style.name.startswith('Heading')] + self.targstyles = [name for name, style in smap.iteritems() if style.name.lower().startswith('heading')] - def isHeading(self, node): + def is_heading(self, node): """ Return true if the input node is a valid index link target. """ @@ -253,14 +253,14 @@ class Index: style = sn.get(k[0]) return style in self.targstyles - def getHeadings(self, node): + def get_headings(self, node): """ Get a list of all children of the input node which are headings - that is, valid targets for an index link """ answer = [] for c in node.getchildren(): - if self.isHeading(c): + if self.is_heading(c): answer.append(c) return answer @@ -290,7 +290,7 @@ class Index: original names. """ pnode = ancestor(node, 'w:p') - if self.isHeading(pnode): + if self.is_heading(pnode): return self.textValue(pnode) while True: @@ -300,7 +300,7 @@ class Index: # Maintain document order in these lists pindex = parent.index(pnode) - hlist = self.getHeadings(parent) + hlist = self.get_headings(parent) hlist = filter(lambda x: parent.index(x) < pindex, hlist) if len(hlist) > 0: return self.textValue(hlist[-1]) @@ -344,7 +344,7 @@ class Index: targnode = self.findTarget(rnode) entry.append(targnode) - def genStyles(self): + def gen_styles(self): """ Generate css styles for the index elements. We do title, section header, and three levels of entries. @@ -360,13 +360,13 @@ class Index: css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')]) self.sectionStyle = self.convert.styles.register(css, 'block') - self.entryStyles = [] + self.entry_styles = [] for i in range(3): indent = str(i*20) + 'pt' css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)]) - self.entryStyles.append(self.convert.styles.register(css, 'block')) + self.entry_styles.append(self.convert.styles.register(css, 'block')) - def findSection(self, tag): + def find_section(self, tag): """ Find the section for this index entry, creating it if required. The tag has a form like A or A:B or etc. @@ -393,35 +393,33 @@ class Index: This method writes it into the html. """ body = self.convert.body - body.append(Index.addName('Index', self.titleStyle)) + body.append(add_name('Index', self.titleStyle)) # And write them to the html for key in sorted(self.sections.keys()): - self.sections[key].toHtml(key, body, self.convert.anchor_map) + self.sections[key].to_html(key, body, self.convert.anchor_map) - @staticmethod - def addName(str, clname): - # Put this into the convert document map? - dest = P() - dest.set('class', clname) - span = SPAN() - from calibre.ebooks.docx.to_html import Text - text = Text(span, 'text', []) - text.buf.append(str) - setattr(text.elem, text.attr, ''.join(text.buf)) - dest.append(span) - return dest - - @staticmethod - def findEntry(value, dict, index): - """ - Find the Entry in the dictionary, or create a new one. - We convert to lower case to group all capitalizations - together as a single entry. - """ - lvalue = value.lower() - if lvalue in dict: - return dict[lvalue] - ent = Entry(value, index) - dict[lvalue] = ent - return ent +def add_name(str, clname): + # Put this into the convert document map? + dest = P() + dest.set('class', clname) + span = SPAN() + from calibre.ebooks.docx.to_html import Text + text = Text(span, 'text', []) + text.buf.append(str) + setattr(text.elem, text.attr, ''.join(text.buf)) + dest.append(span) + return dest + +def find_entry(value, dict, index): + """ + Find the Entry in the dictionary, or create a new one. + We convert to lower case to group all capitalizations + together as a single entry. + """ + lvalue = value.lower() + if lvalue in dict: + return dict[lvalue] + ent = Entry(value, index) + dict[lvalue] = ent + return ent From 2f3727956b9113e7f6722f231c7a1f21170080b9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 19:10:40 +0530 Subject: [PATCH 3/9] Stylistic cleanups --- src/calibre/ebooks/docx/container.py | 3 +- src/calibre/ebooks/docx/fields.py | 2 - src/calibre/ebooks/docx/index.py | 91 +++++++++++++--------------- 3 files changed, 44 insertions(+), 52 deletions(-) diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index 82b3fc9f4b..deaf5bd4d0 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -22,8 +22,7 @@ from calibre.utils.zipfile import ZipFile from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER def fromstring(raw, parser=RECOVER_PARSER): - res = etree.fromstring(raw, parser=parser) - return res + return etree.fromstring(raw, parser=parser) # Read metadata {{{ def read_doc_props(raw, mi): diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 189927a6c6..4dccfa5914 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -10,8 +10,6 @@ import re from calibre.ebooks.docx.names import XPath, get -import sys - class Field(object): def __init__(self, start): diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py index 45490849c4..0ec2902da2 100644 --- a/src/calibre/ebooks/docx/index.py +++ b/src/calibre/ebooks/docx/index.py @@ -6,21 +6,18 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2014, Kovid Goyal ' -import itertools from collections import OrderedDict -from lxml import html -from lxml.html.builder import ( - HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, SUP, A, DT, DL, DD, H1) -from calibre.ebooks.docx.names import ( - XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor, - ancestor, descendants, namespaces, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS) +from lxml.html.builder import A, SPAN import lxml.etree +from calibre.ebooks.docx.names import XPath, ancestor, namespaces + + NBSP = '\xa0' class Location(object): - """ + r""" This class represents one location in the index. We should provide a way to mark the main entries. Libre office has a main attribute, which doesn't seem to map to docx, and at least @@ -36,10 +33,10 @@ class Location(object): class Entry(object): """ This class represents one index entry. - We can also have a list of subentries for the primary/secondary + We can also have a list of sub-entries for the primary/secondary topic situation. Each entry has a list of locations we want to point to, but - it could be empty if this is only here to organize subentries. + it could be empty if this is only here to organize sub-entries. """ def __init__(self, name, index): @@ -47,7 +44,7 @@ class Entry(object): self.locations = [] self.name = name self.index = index - + def add_entry(self, entry, sub): """ The entry has the form [xxx, field, bookmark, target] @@ -62,7 +59,7 @@ class Entry(object): # As a first pass, we just put a placeholder in the target location # We want it to float right markid = amap[loc.bookmark] - if markid == None: + if markid is None: return span = A() @@ -130,13 +127,13 @@ class Section(object): topics = entry[0].strip('"').split(':') targ = find_entry(topics[0], self.entries, self.index) targ.add_entry(entry, topics[1:]) - + def to_html(self, key, body, amap): """ Add one section of the index to the html """ if len(key) > 0: - body.append(add_name(key, self.index.sectionStyle)) + body.append(add_name(key, self.index.section_style)) for ekey in sorted(self.entries.keys()): self.entries[ekey].to_html(body, 0, amap) @@ -145,7 +142,7 @@ class Index(object): This class generates an alphabetical index from the index markers in a docx file. Each field in the parse of the docx file contains an instructions list. - Instructions with name XE are index instructions. + Instructions with name XE are index instructions. The instruction also contains the entry specifier, of the form A[:B[:C]] for main entry, A, subentry B, and so on. @@ -184,10 +181,10 @@ class Index(object): sec.add_entry(unit) def get_entries(self): - """ + r""" We already have a list of fields which includes the index marks, identified by an XE tag. - In the base case, the field object includes an instruction list + In the base case, the field object includes an instruction list with one tuple like ('XE', '"entry"'), where entry is the text we want to put in the index. Note the double quotes around the entry. Sometimes the entry is broken up in the document, for example if @@ -202,28 +199,27 @@ class Index(object): """ fields = self.convert.fields.fields + def get_entry(field): + elist = [field.instructions[0][1]] + for inst in field.instructions[1:]: + elist.append(inst[0]) + elist.append(inst[1]) + + entry = ''.join(elist) + sep1 = entry.partition('"') + if sep1[2] == '': + return entry + sep2 = sep1[2].partition('"') + return sep2[0] + # Only want the index entries - fields = filter(lambda f: len(f.instructions) > 0 and f.instructions[0][0] == 'XE', fields) - return map(lambda f: [self.get_entry(f), f], fields) - - def get_entry(self, field): - - elist = [field.instructions[0][1]] - for inst in field.instructions[1:]: - elist.append(inst[0]) - elist.append(inst[1]) - - entry = ''.join(elist) - sep1 = entry.partition('"') - if sep1[2] == '': - return entry - sep2 = sep1[2].partition('"') - return sep2[0] + return [[get_entry(f), f] for f in fields + if f.instructions and f.instructions[0][0] == 'XE'] def target_styles(self): """ We want to get a list of styles which represent valid index targets. - That is, the text of a link in the index will be the title of the + That is, the text of a link in the index will be the title of the section of the document containing the indexed location. We want the list of styles which can provide a valid title. In practice, this maps to Heading1 through Heading3 in the original document. @@ -242,7 +238,7 @@ class Index(object): """ snodes = XPath("./w:pPr/w:pStyle")(node) if len(snodes) == 0: - return False; + return False sn = snodes[0] @@ -264,14 +260,13 @@ class Index(object): answer.append(c) return answer - def textValue(self, node): + def text_value(self, node): tnodes = XPath("./w:r/w:t")(node) if len(tnodes) == 0: return 'Link' - textl = map(lambda x: x.text, tnodes) - return ''.join(textl) + return ''.join((x.text or '') for x in tnodes) - def findTarget(self, node): + def find_target(self, node): """ Given an index entry, find the text of the last heading section preceding the entry. @@ -279,7 +274,7 @@ class Index(object): return the text. Otherwise, go up the document level by level, staring with the parent of the w:p element containing the entry. - At each level, get the list of heading w:p elements which are + At each level, get the list of heading w:p elements which are children of the top node. We also have the index in the top node of the child node containing the entry. Find the largest index of a heading child which is < the entry @@ -291,11 +286,11 @@ class Index(object): """ pnode = ancestor(node, 'w:p') if self.is_heading(pnode): - return self.textValue(pnode) + return self.text_value(pnode) while True: parent = pnode.getparent() - if parent == None: + if parent is None: return 'Link' # Maintain document order in these lists @@ -303,7 +298,7 @@ class Index(object): hlist = self.get_headings(parent) hlist = filter(lambda x: parent.index(x) < pindex, hlist) if len(hlist) > 0: - return self.textValue(hlist[-1]) + return self.text_value(hlist[-1]) # Try again pnode = parent @@ -341,7 +336,7 @@ class Index(object): # We should make the targets configurable, and add chapter # titles and maybe other things. # What about numbering? - targnode = self.findTarget(rnode) + targnode = self.find_target(rnode) entry.append(targnode) def gen_styles(self): @@ -350,15 +345,15 @@ class Index(object): We do title, section header, and three levels of entries. These are reasonable styles which only set a couple of key values, but we could provide an interface to allow the user to set them. - Is there any problem registering the styles this early in the + Is there any problem registering the styles this early in the conversion process? """ # The result is a string we can use as a class name. css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')]) - self.titleStyle = self.convert.styles.register(css, 'block') + self.title_style = self.convert.styles.register(css, 'block') css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')]) - self.sectionStyle = self.convert.styles.register(css, 'block') + self.section_style = self.convert.styles.register(css, 'block') self.entry_styles = [] for i in range(3): @@ -393,7 +388,7 @@ class Index(object): This method writes it into the html. """ body = self.convert.body - body.append(add_name('Index', self.titleStyle)) + body.append(add_name('Index', self.title_style)) # And write them to the html for key in sorted(self.sections.keys()): From 4bede79eaefa7131e14ef42500f5385be371bfa6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 19:40:49 +0530 Subject: [PATCH 4/9] Initial stab at parsing XE fields robustly --- src/calibre/ebooks/docx/fields.py | 56 ++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 4dccfa5914..c2e2fac954 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -54,6 +54,23 @@ def parse_hyperlink(raw, log): last_option = None return ans +def parse_xe(raw, log): + ans = {} + last_option = None + raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02') + for token, token_type in scanner.scan(raw)[0]: + token = token.replace('\x01', '\\').replace('\x02', '"') + if token_type is FLAG: + last_option = {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}.get(token[1], None) + if last_option is not None: + ans[last_option] = None + elif token_type is WORD: + if last_option is None: + ans['text'] = token + else: + ans[last_option] = token + last_option = None + return ans class Fields(object): @@ -105,20 +122,35 @@ class Fields(object): for runs in all_runs: self.hyperlink_fields.append((hl, runs)) -def test_parse_hyperlink(): + # Parse XE fields + self.xe_fields = [] + for field in self.fields: + if len(field.instructions) >= 1 and field.instructions[0][0] == 'HYPERLINK': + xe = parse_xe(field.instructions[0][1], log) # TODO: Handle field with multiple instructions + if xe: + # TODO: parse the field contents + self.xe_fields.append(xe) + +def test_parse_fields(): import unittest - class TestParseHyperLink(unittest.TestCase): + class TestParseFields(unittest.TestCase): - def test_parsing(self): - self.assertEqual(parse_hyperlink( - r'\l anchor1', None), {'anchor':'anchor1'}) - self.assertEqual(parse_hyperlink( - r'www.calibre-ebook.com', None), {'url':'www.calibre-ebook.com'}) - self.assertEqual(parse_hyperlink( - r'www.calibre-ebook.com \t target \o tt', None), {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}) - self.assertEqual(parse_hyperlink( - r'"c:\\Some Folder"', None), {'url': 'c:\\Some Folder'}) + def test_hyperlink(self): + ae = lambda x, y: self.assertEqual(parse_hyperlink(x, None), y) + ae(r'\l anchor1', {'anchor':'anchor1'}) + ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'}) + ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}) + ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'}) - suite = unittest.TestLoader().loadTestsFromTestCase(TestParseHyperLink) + def test_xe(self): + ae = lambda x, y: self.assertEqual(parse_xe(x, None), y) + ae(r'"some name"', {'text':'some name'}) + ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None}) + ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'}) + + suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields) unittest.TextTestRunner(verbosity=4).run(suite) + +if __name__ == '__main__': + test_parse_fields() From b07752e389dd525378e690ed4e062523686c9ac6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 19:47:26 +0530 Subject: [PATCH 5/9] When parsing fields ignore unknown flags --- src/calibre/ebooks/docx/fields.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index c2e2fac954..2451465198 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -35,6 +35,7 @@ scanner = re.Scanner([ (r'\s+', None), ], flags=re.DOTALL) +null = object() def parse_hyperlink(raw, log): ans = {} @@ -43,7 +44,7 @@ def parse_hyperlink(raw, log): for token, token_type in scanner.scan(raw)[0]: token = token.replace('\x01', '\\').replace('\x02', '"') if token_type is FLAG: - last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None) + last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], null) if last_option is not None: ans[last_option] = None elif token_type is WORD: @@ -52,6 +53,7 @@ def parse_hyperlink(raw, log): else: ans[last_option] = token last_option = None + ans.pop(null, None) return ans def parse_xe(raw, log): @@ -61,7 +63,7 @@ def parse_xe(raw, log): for token, token_type in scanner.scan(raw)[0]: token = token.replace('\x01', '\\').replace('\x02', '"') if token_type is FLAG: - last_option = {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}.get(token[1], None) + last_option = {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}.get(token[1], null) if last_option is not None: ans[last_option] = None elif token_type is WORD: @@ -70,6 +72,7 @@ def parse_xe(raw, log): else: ans[last_option] = token last_option = None + ans.pop(null, None) return ans class Fields(object): @@ -142,6 +145,7 @@ def test_parse_fields(): ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'}) ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}) ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'}) + ae(r'xxxx \y yyyy', {'url': 'xxxx'}) def test_xe(self): ae = lambda x, y: self.assertEqual(parse_xe(x, None), y) From 0b98f34f84656628b58b1a2737486005683a9a7f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 20:47:58 +0530 Subject: [PATCH 6/9] DRY --- src/calibre/ebooks/docx/fields.py | 65 ++++++++++++++----------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 2451465198..24be0cad86 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -37,43 +37,36 @@ scanner = re.Scanner([ null = object() -def parse_hyperlink(raw, log): - ans = {} - last_option = None - raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02') - for token, token_type in scanner.scan(raw)[0]: - token = token.replace('\x01', '\\').replace('\x02', '"') - if token_type is FLAG: - last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], null) - if last_option is not None: - ans[last_option] = None - elif token_type is WORD: - if last_option is None: - ans['url'] = token - else: - ans[last_option] = token - last_option = None - ans.pop(null, None) - return ans +def parser(name, field_map, default_field_name): -def parse_xe(raw, log): - ans = {} - last_option = None - raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02') - for token, token_type in scanner.scan(raw)[0]: - token = token.replace('\x01', '\\').replace('\x02', '"') - if token_type is FLAG: - last_option = {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}.get(token[1], null) - if last_option is not None: - ans[last_option] = None - elif token_type is WORD: - if last_option is None: - ans['text'] = token - else: - ans[last_option] = token - last_option = None - ans.pop(null, None) - return ans + def parse(raw, log=None): + ans = {} + last_option = None + raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02') + for token, token_type in scanner.scan(raw)[0]: + token = token.replace('\x01', '\\').replace('\x02', '"') + if token_type is FLAG: + last_option = field_map.get(token[1], null) + if last_option is not None: + ans[last_option] = None + elif token_type is WORD: + if last_option is None: + ans[default_field_name] = token + else: + ans[last_option] = token + last_option = None + ans.pop(null, None) + return ans + + parse.__name__ = str('parse_' + name) + + return parse + +parse_hyperlink = parser('hyperlink', + {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}, 'url') + +parse_xe = parser('xe', + {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}, 'text') class Fields(object): From 5d4c4b857e47bff419e06d64f18112b92fee3b6f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 21:21:16 +0530 Subject: [PATCH 7/9] Parse INDEX fields --- src/calibre/ebooks/docx/fields.py | 96 ++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 24be0cad86..90e80423ff 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -37,7 +37,9 @@ scanner = re.Scanner([ null = object() -def parser(name, field_map, default_field_name): +def parser(name, field_map, default_field_name=None): + + field_map = dict((x.split(':') for x in field_map.split())) def parse(raw, log=None): ans = {} @@ -63,10 +65,15 @@ def parser(name, field_map, default_field_name): return parse parse_hyperlink = parser('hyperlink', - {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}, 'url') + 'l:anchor m:image-map n:target o:title t:target', 'url') parse_xe = parser('xe', - {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}, 'text') + 'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text') + +parse_index = parser('index', + 'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator' + ' f:entry-type g:page-range-separator h:heading k:crossref-separator' + ' p:page-number-separator r:run-together y:yomi z:langcode') class Fields(object): @@ -94,38 +101,56 @@ class Fields(object): if stack: stack[-1].contents.append(elem) - # Parse hyperlink fields - self.hyperlink_fields = [] - for field in self.fields: - if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK': - hl = parse_hyperlink(field.instructions[0][1], log) - if hl: - if 'target' in hl and hl['target'] is None: - hl['target'] = '_blank' - all_runs = [] - current_runs = [] - # We only handle spans in a single paragraph - # being wrapped in - for x in field.contents: - if x.tag.endswith('}p'): - if current_runs: - all_runs.append(current_runs) - current_runs = [] - elif x.tag.endswith('}r'): - current_runs.append(x) - if current_runs: - all_runs.append(current_runs) - for runs in all_runs: - self.hyperlink_fields.append((hl, runs)) + field_types = ('hyperlink', 'xe', 'index') + parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types} + field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types} + + for f in field_types: + setattr(self, '%s_fields' % f, []) - # Parse XE fields - self.xe_fields = [] for field in self.fields: - if len(field.instructions) >= 1 and field.instructions[0][0] == 'HYPERLINK': - xe = parse_xe(field.instructions[0][1], log) # TODO: Handle field with multiple instructions - if xe: - # TODO: parse the field contents - self.xe_fields.append(xe) + if field.instructions: + name = field.instructions[0][0] + func = parsers.get(name, None) + if func is not None: + func(field, field_parsers[name], log) + + def parse_hyperlink(self, field, parse_func, log): + # Parse hyperlink fields + if len(field.instructions) == 1: + hl = parse_func(field.instructions[0][1], log) + if hl: + if 'target' in hl and hl['target'] is None: + hl['target'] = '_blank' + all_runs = [] + current_runs = [] + # We only handle spans in a single paragraph + # being wrapped in + for x in field.contents: + if x.tag.endswith('}p'): + if current_runs: + all_runs.append(current_runs) + current_runs = [] + elif x.tag.endswith('}r'): + current_runs.append(x) + if current_runs: + all_runs.append(current_runs) + for runs in all_runs: + self.hyperlink_fields.append((hl, runs)) + + def parse_xe(self, field, parse_func, log): + # Parse XE fields + xe = parse_func(field.instructions[0][1], log) # TODO: Handle field with multiple instructions + if xe: + # TODO: parse the field contents + self.xe_fields.append(xe) + + def parse_index(self, field, parse_func, log): + # Parse Index fields + if len(field.instructions): + idx = parse_func(field.instructions[0][1], log) + # TODO: parse the field contents + self.index_fields.append(idx) def test_parse_fields(): import unittest @@ -146,6 +171,11 @@ def test_parse_fields(): ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None}) ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'}) + def test_index(self): + ae = lambda x, y: self.assertEqual(parse_index(x, None), y) + ae(r'', {}) + ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'}) + suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields) unittest.TextTestRunner(verbosity=4).run(suite) From a37fe4b71793185d8e296f5491065db7f42377a4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 21:32:09 +0530 Subject: [PATCH 8/9] Get rid of the option to control index generation Index will be generated automatically, wherever the INDEX field exists in the input docx file. --- .../ebooks/conversion/plugins/docx_input.py | 6 +----- src/calibre/ebooks/docx/index.py | 1 + src/calibre/ebooks/docx/to_html.py | 14 +++++--------- src/calibre/gui2/convert/docx_input.py | 2 +- src/calibre/gui2/convert/docx_input.ui | 7 ------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py index fe987ea28f..a998fd4678 100644 --- a/src/calibre/ebooks/conversion/plugins/docx_input.py +++ b/src/calibre/ebooks/conversion/plugins/docx_input.py @@ -19,15 +19,11 @@ class DOCXInput(InputFormatPlugin): help=_('Normally, if a large image is present at the start of the document that looks like a cover, ' 'it will be removed from the document and used as the cover for created ebook. This option ' 'turns off that behavior.')), - OptionRecommendation(name='docx_index', recommended_value=False, - help=_('If there are embedded index markers in the document, this option will use them to create ' - 'an alphabetical index with links to the locations of the markers.')), - } recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)]) def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.docx.to_html import Convert - return Convert(stream, detect_cover=not options.docx_no_cover, do_index=options.docx_index, log=log)() + return Convert(stream, detect_cover=not options.docx_no_cover, log=log)() diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py index 0ec2902da2..3e7e9dc42f 100644 --- a/src/calibre/ebooks/docx/index.py +++ b/src/calibre/ebooks/docx/index.py @@ -387,6 +387,7 @@ class Index(object): We generated the index object in the constructor. This method writes it into the html. """ + # TODO: Only do this at locations of the INDEX field in the document body = self.convert.body body.append(add_name('Index', self.title_style)) diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 3b0be2febf..5a17d25eab 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -45,13 +45,12 @@ class Text: class Convert(object): - def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, do_index=False, notes_text=None): + def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None): self.docx = DOCX(path_or_stream, log=log) self.ms_pat = re.compile(r'\s{2,}') self.ws_pat = re.compile(r'[\n\r\t]') self.log = self.docx.log self.detect_cover = detect_cover - self.do_index = do_index self.notes_text = notes_text or _('Notes') self.dest_dir = dest_dir or os.getcwdu() self.mi = self.docx.metadata @@ -103,9 +102,7 @@ class Convert(object): # If we are doing an index, do the body part of the processing here. # We need to insert bookmarks at the indexed locations before the # main conversion work. - if self.do_index: - self.log.debug('Generating index') - index = Index(self) + index = Index(self) self.read_page_properties(doc) self.current_rels = relationships_by_id @@ -168,14 +165,13 @@ class Convert(object): parent.text = tabs[-1].tail or '' map(parent.remove, tabs) - # For an index, we now want to append the index object - if self.do_index: - index.generate() - self.images.rid_map = orig_rid_map self.resolve_links() + # For an index, we now want to append the index object + index.generate() + self.styles.cascade(self.layers) self.tables.apply_markup(self.object_map, self.page_map) diff --git a/src/calibre/gui2/convert/docx_input.py b/src/calibre/gui2/convert/docx_input.py index 0fbfb66634..46234c6a36 100644 --- a/src/calibre/gui2/convert/docx_input.py +++ b/src/calibre/gui2/convert/docx_input.py @@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['docx_no_cover', 'docx_index', ]) + ['docx_no_cover', ]) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/docx_input.ui b/src/calibre/gui2/convert/docx_input.ui index 8f3843dd2a..41948118dc 100644 --- a/src/calibre/gui2/convert/docx_input.ui +++ b/src/calibre/gui2/convert/docx_input.ui @@ -21,13 +21,6 @@ - - - - Generate an alphabetical index from embedded index markers - - - From 8a275f7defa57be6d7318608d8b4508d0950cc6e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 21:49:17 +0530 Subject: [PATCH 9/9] Temporarily disable index generation pending refactoring --- src/calibre/ebooks/conversion/plugins/docx_input.py | 1 + src/calibre/ebooks/docx/to_html.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/docx_input.py b/src/calibre/ebooks/conversion/plugins/docx_input.py index a998fd4678..30a4bb3868 100644 --- a/src/calibre/ebooks/conversion/plugins/docx_input.py +++ b/src/calibre/ebooks/conversion/plugins/docx_input.py @@ -19,6 +19,7 @@ class DOCXInput(InputFormatPlugin): help=_('Normally, if a large image is present at the start of the document that looks like a cover, ' 'it will be removed from the document and used as the cover for created ebook. This option ' 'turns off that behavior.')), + } recommendations = set([('page_breaks_before', '/', OptionRecommendation.MED)]) diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 5a17d25eab..9d09e423f2 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -28,7 +28,7 @@ from calibre.ebooks.docx.theme import Theme from calibre.ebooks.docx.toc import create_toc from calibre.ebooks.docx.fields import Fields from calibre.ebooks.docx.settings import Settings -from calibre.ebooks.docx.index import Index +# from calibre.ebooks.docx.index import Index from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 @@ -102,7 +102,7 @@ class Convert(object): # If we are doing an index, do the body part of the processing here. # We need to insert bookmarks at the indexed locations before the # main conversion work. - index = Index(self) + # index = Index(self) self.read_page_properties(doc) self.current_rels = relationships_by_id @@ -170,7 +170,7 @@ class Convert(object): self.resolve_links() # For an index, we now want to append the index object - index.generate() + # index.generate() self.styles.cascade(self.layers)