Merge branch 'index'

2025-11-20 13:33:02 -05:00 · 2014-03-30 21:52:46 +05:30 · 2014-03-30 21:52:46 +05:30 · c2d94d518f
commit c2d94d518f
parent 4a79e4a7b1 8a275f7def
3 changed files with 545 additions and 52 deletions
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@ -16,6 +16,7 @@ class Field(object):
        self.start = start
        self.end = None
        self.contents = []
        self.elements = []
        self.instructions = []
    def add_instr(self, elem):
@ -24,6 +25,7 @@ class Field(object):
            return
        name, rest = raw.strip().partition(' ')[0::2]
        self.instructions.append((name, rest.strip()))
        self.elements.append(elem)
 WORD, FLAG = 0, 1
 scanner = re.Scanner([
@ -33,25 +35,45 @@ scanner = re.Scanner([
    (r'\s+', None),
 ], flags=re.DOTALL)
 null = object()
-def parse_hyperlink(raw, log):
+def parser(name, field_map, default_field_name=None):
    field_map = dict((x.split(':') for x in field_map.split()))
    def parse(raw, log=None):
        ans = {}
        last_option = None
        raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
        for token, token_type in scanner.scan(raw)[0]:
            token = token.replace('\x01', '\\').replace('\x02', '"')
            if token_type is FLAG:
-            last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
+                last_option = field_map.get(token[1], null)
                if last_option is not None:
                    ans[last_option] = None
            elif token_type is WORD:
                if last_option is None:
-                ans['url'] = token
+                    ans[default_field_name] = token
                else:
                    ans[last_option] = token
                    last_option = None
        ans.pop(null, None)
        return ans
    parse.__name__ = str('parse_' + name)
    return parse
 parse_hyperlink = parser('hyperlink',
    'l:anchor m:image-map n:target o:title t:target', 'url')
 parse_xe = parser('xe',
    'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')
 parse_index = parser('index',
    'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
    ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
    ' p:page-number-separator r:run-together y:yomi z:langcode')
 class Fields(object):
@ -79,11 +101,24 @@ class Fields(object):
                if stack:
                    stack[-1].contents.append(elem)
-        # Parse hyperlink fields
+        field_types = ('hyperlink', 'xe', 'index')
-        self.hyperlink_fields = []
+        parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types}
        field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types}
        for f in field_types:
            setattr(self, '%s_fields' % f, [])
        for field in self.fields:
-            if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK':
+            if field.instructions:
-                hl = parse_hyperlink(field.instructions[0][1], log)
+                name = field.instructions[0][0]
                func = parsers.get(name, None)
                if func is not None:
                    func(field, field_parsers[name], log)
    def parse_hyperlink(self, field, parse_func, log):
        # Parse hyperlink fields
        if len(field.instructions) == 1:
            hl = parse_func(field.instructions[0][1], log)
            if hl:
                if 'target' in hl and hl['target'] is None:
                    hl['target'] = '_blank'
@ -103,20 +138,46 @@ class Fields(object):
                for runs in all_runs:
                    self.hyperlink_fields.append((hl, runs))
-def test_parse_hyperlink():
+    def parse_xe(self, field, parse_func, log):
        # Parse XE fields
        xe = parse_func(field.instructions[0][1], log)  # TODO: Handle field with multiple instructions
        if xe:
            # TODO: parse the field contents
            self.xe_fields.append(xe)
    def parse_index(self, field, parse_func, log):
        """
        Parse an INDEX field and store the result in self.index_fields.

        Only the first instruction of the field is handed to parse_func;
        a field without any instructions is ignored.
        """
        instructions = field.instructions
        if not instructions:
            return
        # TODO: parse the field contents
        self.index_fields.append(parse_func(instructions[0][1], log))
 def test_parse_fields():
    import unittest
-    class TestParseHyperLink(unittest.TestCase):
+    class TestParseFields(unittest.TestCase):
-        def test_parsing(self):
+        def test_hyperlink(self):
-            self.assertEqual(parse_hyperlink(
+            ae = lambda x, y: self.assertEqual(parse_hyperlink(x, None), y)
-                r'\l anchor1', None), {'anchor':'anchor1'})
+            ae(r'\l anchor1', {'anchor':'anchor1'})
-            self.assertEqual(parse_hyperlink(
+            ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'})
-                r'www.calibre-ebook.com', None), {'url':'www.calibre-ebook.com'})
+            ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
-            self.assertEqual(parse_hyperlink(
+            ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'})
-                r'www.calibre-ebook.com \t target \o tt', None), {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
+            ae(r'xxxx \y yyyy', {'url': 'xxxx'})
            self.assertEqual(parse_hyperlink(
                r'"c:\\Some Folder"', None), {'url': 'c:\\Some Folder'})
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseHyperLink)
+        def test_xe(self):
            ae = lambda x, y: self.assertEqual(parse_xe(x, None), y)
            ae(r'"some name"', {'text':'some name'})
            ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None})
            ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'})
        def test_index(self):
            ae = lambda x, y: self.assertEqual(parse_index(x, None), y)
            ae(r'', {})
            ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'})
    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
    unittest.TextTestRunner(verbosity=4).run(suite)
 if __name__ == '__main__':
    test_parse_fields()
--- a/src/calibre/ebooks/docx/index.py
+++ b/src/calibre/ebooks/docx/index.py
@ -0,0 +1,421 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 from collections import OrderedDict
 import lxml.etree
 from lxml.html.builder import A, P, SPAN
 from calibre.ebooks.docx.names import XPath, ancestor, namespaces
 NBSP = '\xa0'
 class Location(object):
    r"""
    A single place in the document that an index entry points to.

    Every index entry owns a list of these, one for each spot in the text
    where the entry is referenced.

    Main entries are not marked in any way yet. Libre office has a main
    attribute, which doesn't seem to map to docx, and at least some
    versions of word can mark entries bold or italic with \b and \i.
    """

    def __init__(self, bookmark, target):
        # bookmark: name of the bookmark inserted at the indexed location.
        # target: text shown for the link that leads to that location.
        self.bookmark, self.target = bookmark, target
 class Entry(object):
    """
    This class represents one index entry.

    An entry can hold sub-entries (the primary/secondary topic situation)
    in self.subentries, keyed by the lower-cased sub-entry name.

    Each entry also has a list of locations we want to point to, but it
    can be empty if the entry only exists to organize sub-entries.
    """

    def __init__(self, name, index):
        self.subentries = {}
        self.locations = []
        self.name = name
        self.index = index

    def add_entry(self, entry, sub):
        """
        Record one index marker under this entry.

        The entry has the form [xxx, field, bookmark, target]. sub is the
        list of sub-topic names still to be descended into: when it is
        empty the marker belongs to this entry, otherwise it is handed to
        the matching sub-entry, which is created on demand.
        """
        if not sub:
            self.locations.append(Location(entry[2], entry[3]))
            return
        sube = find_entry(sub[0], self.subentries, self.index)
        sube.add_entry(entry, sub[1:])

    def make_link(self, loc, amap):
        """
        Build the right-floated <a> element linking to one location.

        amap maps bookmark names to anchor ids. Returns None when the
        bookmark is unknown or has no anchor, so callers must be prepared
        to skip the link.
        """
        # .get() rather than [] so that a bookmark missing from the map is
        # treated like one without an anchor instead of raising KeyError.
        markid = amap.get(loc.bookmark)
        if markid is None:
            return None
        # As a first pass, we just put a placeholder in the target location
        # We want it to float right
        link = A()
        link.set('style', 'float:right')
        link.set('href', '#' + markid)
        from calibre.ebooks.docx.to_html import Text
        text = Text(link, 'text', [])
        text.buf.append(loc.target)
        setattr(text.elem, text.attr, ''.join(text.buf))
        return link

    def to_htmlunit(self, body, level, amap):
        """
        Append the material for one index entry to the document.

        There is a name, and 0 or more locations. The first resolvable
        link, if any, goes on the same line as the name and the others on
        following lines. Locations for which no link can be built are
        skipped.
        """
        style = self.index.entry_styles[level]
        main = add_name(self.name, style)
        body.append(main)

        # Compare against None explicitly: an lxml element without
        # children is falsy, so a plain truth test would drop valid links.
        links = []
        for loc in self.locations:
            link = self.make_link(loc, amap)
            if link is not None:
                links.append(link)
        if not links:
            return

        # First link on same line as name
        main.append(links[0])

        # Put other links for same entry on their own lines
        # To keep the link span separate need to put a space as the name
        for link in links[1:]:
            dest = P()
            dest.set('class', style)
            dest.text = NBSP
            dest.append(link)
            body.append(dest)

    def to_html(self, body, level, amap):
        """
        Write this entry and, recursively, its sub-entries in sorted key
        order. Only three entry styles exist, so the level is clamped to 2.
        """
        level = min(level, 2)
        self.to_htmlunit(body, level, amap)
        for key in sorted(self.subentries):
            self.subentries[key].to_html(body, level + 1, amap)
 class Section(object):
    """
    One section of the index - usually all the entries that start with
    the same letter, for example the A's or the B's.

    At heart it is a dictionary of entries keyed by lower-cased name.
    """

    def __init__(self, index):
        self.entries = {}
        self.index = index

    def add_entry(self, entry):
        """
        File the information from one index marker in this section.

        The entry has form [name, field, bookmark, target], where the name
        looks like A or A:B and so on. The top level entry for the first
        topic is looked up (or created) and the remaining topics are passed
        down to it, so a repeated name just gains a new location.
        """
        topics = entry[0].strip('"').split(':')
        first, remaining = topics[0], topics[1:]
        find_entry(first, self.entries, self.index).add_entry(entry, remaining)

    def to_html(self, key, body, amap):
        """
        Add one section of the index to the html.

        A heading is written only for a non-empty key; the entries follow
        in sorted key order.
        """
        if key:
            heading = add_name(key, self.index.section_style)
            body.append(heading)
        for name in sorted(self.entries):
            self.entries[name].to_html(body, 0, amap)
 class Index(object):
    """
    This class generates an alphabetical index from the index markers in a docx file.

    Each field in the parse of the docx file contains an instructions list.
    Instructions with name XE are index instructions.
    The instruction also contains the entry specifier, of the form A[:B[:C]] for
    main entry, A, subentry B, and so on.

    The index object is a dictionary of sections, 'a' mapping to a section
    object with all the A entries, and so on (everything that does not start
    with a letter is grouped under ''). Each section in turn is a dictionary
    of entries, each of which holds the list of locations where that entry is
    referenced.

    We could make the formatting more configurable.
    Currently it uses fixed styles for the various elements, and a section
    heading for each letter.
    """

    def __init__(self, convert):
        """
        Convert the index markers in the document into an index object.
        """
        self.convert = convert
        self.sections = {}

        self.gen_styles()

        # Get a list of [name, field] entries, where name is the index
        # entry and field is the indexed location
        self.entries = self.get_entries()

        # Find the styles which provide the text for links.
        self.target_styles()

        # Generate bookmarks in the document at the indexed locations.
        # This extends every entry to [name, field, bookmark, target, ...]
        self.bookmarks()

        # Set up the entries in index sections
        for unit in self.entries:
            sec = self.find_section(unit[0])
            sec.add_entry(unit)

    def get_entries(self):
        r"""
        Return a list of [entry text, field] pairs, one per XE field.

        We already have a list of fields which includes the index marks,
        identified by an XE tag.
        In the base case, the field object includes an instruction list
        with one tuple like ('XE', '"entry"'), where entry is the text we
        want to put in the index. Note the double quotes around the entry.

        Sometimes the entry is broken up in the document, for example if
        there are spelling issues in the entry text.
        In this case the instruction list includes a number of tuples, and
        we get the actual entry text by concatenating all of them after
        the initial tag.

        There can be formatting information in the instructions also, after
        the double quoted part, like '"entry" \b'.

        So, we want to concatenate all parts after the initial tag, and
        then get the part in double quotes. If there is no double quote at
        all, or nothing follows the first one, the whole concatenated text
        is used.
        """
        fields = self.convert.fields.fields

        def get_entry(field):
            parts = [field.instructions[0][1]]
            for inst in field.instructions[1:]:
                parts.append(inst[0])
                parts.append(inst[1])
            entry = ''.join(parts)
            after_quote = entry.partition('"')[2]
            if after_quote == '':
                return entry
            return after_quote.partition('"')[0]

        # Only want the index entries
        return [[get_entry(f), f] for f in fields
                if f.instructions and f.instructions[0][0] == 'XE']

    def target_styles(self):
        """
        Collect the ids of the styles which represent valid index targets.

        The text of a link in the index will be the title of the section
        of the document containing the indexed location, so we want the
        list of styles which can provide a valid title.
        In practice, this maps to Heading1 through Heading3 in the original document.

        Calibre apparently preprocesses docx files, so that a paragraph in
        the original with style Heading1 will now have a different, internal style.

        In this version we use convert.styles.id_map to find style ids
        with internal names beginning Heading; but I'd feel better if we
        jumped in earlier and could map it to the original docx styles.
        """
        smap = self.convert.styles.id_map
        # Iterate over the keys instead of calling iteritems() so this works
        # with any mapping, and tolerate styles that carry no name
        # (NOTE(review): presumably name can be None for unnamed styles).
        self.targstyles = [
            sid for sid in smap
            if (smap[sid].name or '').lower().startswith('heading')]

    def is_heading(self, node):
        """
        Return true if the input node is a valid index link target, that is,
        if its w:pPr/w:pStyle child names one of the target styles.
        """
        snodes = XPath("./w:pPr/w:pStyle")(node)
        if len(snodes) == 0:
            return False

        sn = snodes[0]

        # The key includes the long namespace information
        k = [key for key in sn.keys() if key.endswith('}val')]
        if len(k) == 0:
            return False
        style = sn.get(k[0])
        return style in self.targstyles

    def get_headings(self, node):
        """
        Get a list of all children of the input node which are headings -
        that is, valid targets for an index link. Document order is kept.
        """
        # Iterating an element yields its children; getchildren() is deprecated.
        return [c for c in node if self.is_heading(c)]

    def text_value(self, node):
        """
        Return the concatenated text of the w:r/w:t children of node, or
        the placeholder 'Link' when there are none.
        """
        tnodes = XPath("./w:r/w:t")(node)
        if len(tnodes) == 0:
            return 'Link'
        return ''.join((x.text or '') for x in tnodes)

    def find_target(self, node):
        """
        Given an index entry, find the text of the last heading section
        preceding the entry. Returns 'Link' when there is none.

        To do this, find the containing w:p element. If it is a heading,
        return the text.

        Otherwise, go up the document level by level, starting with the
        parent of the w:p element containing the entry.

        At each level, get the list of heading w:p elements which are
        children of the top node. We also have the index in the top node
        of the child node containing the entry.
        Find the largest index of a heading child which is < the entry
        index, if any - that is the heading we want.

        Perhaps we should precalculate some of this.
        We could also consider doing some of this in xpath, but the style
        attributes have been modified, so we can't just look for the
        original names.
        """
        pnode = ancestor(node, 'w:p')
        if pnode is None:
            # NOTE(review): assumes ancestor() returns None when the marker
            # is not inside a paragraph - confirm against docx.names
            return 'Link'
        if self.is_heading(pnode):
            return self.text_value(pnode)

        while True:
            parent = pnode.getparent()
            if parent is None:
                return 'Link'

            # Maintain document order in these lists. A list is built
            # explicitly because filter() is lazy on Python 3 and has no len()
            pindex = parent.index(pnode)
            preceding = [h for h in self.get_headings(parent)
                         if parent.index(h) < pindex]
            if preceding:
                return self.text_value(preceding[-1])

            # Try again
            pnode = parent

    def bookmarks(self):
        """
        For each index entry we need to insert a bookmark at the target location.
        These bookmarks are for our internal use - I'm not sure they would work well
        in the original docx document.

        For each entry we have the Field object, which includes the instrText
        elements of the document.
        Go to the parent of each one and insert a bookmark start just before it.

        The bookmark name and the link text are appended to the entry, once
        per instruction element of the field.
        """
        tag = "{%s}bookmarkStart" % namespaces['w']
        att = "{%s}name" % namespaces['w']
        bmno = 0
        for entry in self.entries:
            for instnode in entry[1].elements:
                name = 'indexBookmark' + str(bmno)
                bmno += 1
                bookmark = lxml.etree.Element(tag)
                bookmark.set(att, name)
                rnode = instnode.getparent()

                # Add the name so that we can link to it
                entry.append(name)

                # insert the bookmark before rnode
                rparent = rnode.getparent()
                rind = rparent.index(rnode)
                rparent.insert(rind, bookmark)

                # We want the index entry to be the content of the closest
                # preceding Heading paragraph.
                # We should make the targets configurable, and add chapter
                # titles and maybe other things.
                # What about numbering?
                targnode = self.find_target(rnode)
                entry.append(targnode)

    def gen_styles(self):
        """
        Generate css styles for the index elements.
        We do title, section header, and three levels of entries.
        These are reasonable styles which only set a couple of key
        values, but we could provide an interface to allow the user to set them.

        Is there any problem registering the styles this early in the
        conversion process?
        """
        # The result of register() is a string we can use as a class name.
        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
        self.title_style = self.convert.styles.register(css, 'block')

        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
        self.section_style = self.convert.styles.register(css, 'block')

        # One style per nesting level, each indented 20pt more than the last
        self.entry_styles = []
        for i in range(3):
            indent = str(i*20) + 'pt'
            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
            self.entry_styles.append(self.convert.styles.register(css, 'block'))

    def find_section(self, tag):
        """
        Find the section for this index entry, creating it if required.
        The tag has a form like A or A:B or etc.

        If you want a single index without section divisions, you can
        just return the single section here every time.
        """
        # Slice instead of indexing: an empty entry name (e.g. from an
        # XE "" field) must not raise IndexError, it joins the '' section.
        shead = tag[:1]

        # Make it lower case, and group all non-alphabetic things together
        if shead.isalpha():
            shead = shead.lower()
        else:
            shead = ''

        if shead in self.sections:
            return self.sections[shead]
        sect = Section(self)
        self.sections[shead] = sect
        return sect

    def generate(self):
        """
        We generated the index object in the constructor.
        This method writes it into the html.
        """
        # TODO: Only do this at locations of the INDEX field in the document
        body = self.convert.body
        body.append(add_name('Index', self.title_style))

        # And write them to the html
        for key in sorted(self.sections):
            self.sections[key].to_html(key, body, self.convert.anchor_map)
 def add_name(str, clname):
    """
    Build a paragraph of css class clname that holds the text str inside
    a span, and return the paragraph.
    """
    # Put this into the convert document map?
    para = P()
    para.set('class', clname)
    holder = SPAN()
    # Imported here rather than at module level.
    # NOTE(review): presumably avoids a circular import with to_html - confirm
    from calibre.ebooks.docx.to_html import Text
    pending = Text(holder, 'text', [])
    pending.buf.append(str)
    setattr(pending.elem, pending.attr, ''.join(pending.buf))
    para.append(holder)
    return para
 def find_entry(value, dict, index):
    """
    Return the Entry stored for value in the dictionary, adding a new
    one first if there is none yet.

    Keys are lower-cased so that all capitalizations of a name share a
    single entry; a new entry keeps the spelling it was first seen with.
    """
    key = value.lower()
    try:
        return dict[key]
    except KeyError:
        created = Entry(value, index)
        dict[key] = created
        return created
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
 from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.docx.settings import Settings
 # from calibre.ebooks.docx.index import Index
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -97,6 +98,12 @@ class Convert(object):
        paras = []
        self.log.debug('Converting Word markup to HTML')
        # If we are doing an index, do the body part of the processing here.
        # We need to insert bookmarks at the indexed locations before the
        # main conversion work.
        # index = Index(self)
        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
@ -105,6 +112,7 @@ class Convert(object):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
@ -161,6 +169,9 @@ class Convert(object):
        self.resolve_links()
        # For an index, we now want to append the index object
        # index.generate()
        self.styles.cascade(self.layers)
        self.tables.apply_markup(self.object_map, self.page_map)