Merge branch 'index'

2025-06-23 15:30:45 -04:00 · 2014-03-30 21:52:46 +05:30 · 2014-03-30 21:52:46 +05:30 · c2d94d518f
commit c2d94d518f
parent 4a79e4a7b1 8a275f7def
3 changed files with 545 additions and 52 deletions
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@ -16,6 +16,7 @@ class Field(object):
        self.start = start
        self.end = None
        self.contents = []
+        self.elements = []
        self.instructions = []

    def add_instr(self, elem):
@ -24,6 +25,7 @@ class Field(object):
            return
        name, rest = raw.strip().partition(' ')[0::2]
        self.instructions.append((name, rest.strip()))
+        self.elements.append(elem)

 WORD, FLAG = 0, 1
 scanner = re.Scanner([
@ -33,25 +35,45 @@ scanner = re.Scanner([
    (r'\s+', None),
 ], flags=re.DOTALL)

+# Sentinel used by the parse functions as the result key for flags that are
+# not in their field map; that key is removed again before the result is
+# returned, so unknown flags (and their values) never reach the caller.
+null = object()

-def parse_hyperlink(raw, log):
-    ans = {}
-    last_option = None
-    raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
-    for token, token_type in scanner.scan(raw)[0]:
-        token = token.replace('\x01', '\\').replace('\x02', '"')
-        if token_type is FLAG:
-            last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
-            if last_option is not None:
-                ans[last_option] = None
-        elif token_type is WORD:
-            if last_option is None:
-                ans['url'] = token
-            else:
-                ans[last_option] = token
-                last_option = None
-    return ans
+def parser(name, field_map, default_field_name=None):
+    """
+    Create a function that parses the instruction text of a field into a dict.
+
+    :param name: Only used to name the returned function (parse_<name>)
+    :param field_map: Whitespace separated list of flag:key pairs. The flag
+        is the flag letter (the second character of a flag token) and key is
+        the name under which the value of that flag is stored in the result.
+    :param default_field_name: Key under which a word that is not preceded
+        by a flag is stored. If None, such words are ignored instead of
+        being stored under the key None.
+
+    The returned function accepts the raw instruction text and an optional
+    log (which it does not use) and returns a dict. A flag that is not
+    followed by a word maps to None. Flags missing from field_map are
+    dropped, together with the word that follows them.
+    """

+    field_map = dict(x.split(':') for x in field_map.split())
+
+    def parse(raw, log=None):
+        ans = {}
+        last_option = None
+        # Replace escaped backslashes and escaped quotes with placeholder
+        # characters before scanning, they are restored in every token
+        raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
+        for token, token_type in scanner.scan(raw)[0]:
+            token = token.replace('\x01', '\\').replace('\x02', '"')
+            if token_type is FLAG:
+                # Unknown flags are collected under the null sentinel so
+                # that the word following them is consumed as their value
+                last_option = field_map.get(token[1], null)
+                ans[last_option] = None
+            elif token_type is WORD:
+                if last_option is None:
+                    if default_field_name is not None:
+                        ans[default_field_name] = token
+                else:
+                    ans[last_option] = token
+                    last_option = None
+        ans.pop(null, None)
+        return ans
+
+    parse.__name__ = str('parse_' + name)
+
+    return parse
+
+# Parser for HYPERLINK instructions, a word without a flag is stored as 'url'
+parse_hyperlink = parser('hyperlink',
+    'l:anchor m:image-map n:target o:title t:target', 'url')
+
+# Parser for XE (index entry) instructions, a word without a flag is stored
+# as 'text'
+parse_xe = parser('xe',
+    'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')
+
+# Parser for INDEX instructions, no default field name is given for words
+# that are not preceded by a flag
+parse_index = parser('index',
+    'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
+    ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
+    ' p:page-number-separator r:run-together y:yomi z:langcode')

 class Fields(object):

@ -79,44 +101,83 @@ class Fields(object):
                if stack:
                    stack[-1].contents.append(elem)

-        # Parse hyperlink fields
-        self.hyperlink_fields = []
-        for field in self.fields:
-            if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK':
-                hl = parse_hyperlink(field.instructions[0][1], log)
-                if hl:
-                    if 'target' in hl and hl['target'] is None:
-                        hl['target'] = '_blank'
-                    all_runs = []
-                    current_runs = []
-                    # We only handle spans in a single paragraph
-                    # being wrapped in <a>
-                    for x in field.contents:
-                        if x.tag.endswith('}p'):
-                            if current_runs:
-                                all_runs.append(current_runs)
-                            current_runs = []
-                        elif x.tag.endswith('}r'):
-                            current_runs.append(x)
-                    if current_runs:
-                        all_runs.append(current_runs)
-                    for runs in all_runs:
-                        self.hyperlink_fields.append((hl, runs))
+        field_types = ('hyperlink', 'xe', 'index')
+        parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types}
+        field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types}

-def test_parse_hyperlink():
+        for f in field_types:
+            setattr(self, '%s_fields' % f, [])
+
+        for field in self.fields:
+            if field.instructions:
+                name = field.instructions[0][0]
+                func = parsers.get(name, None)
+                if func is not None:
+                    func(field, field_parsers[name], log)
+
+    def parse_hyperlink(self, field, parse_func, log):
+        """
+        Parse a HYPERLINK field and record the runs it applies to.
+
+        Fields that do not have exactly one instruction, or whose
+        instruction parses to an empty dict, are ignored. For every group
+        of runs found in the field contents a (hyperlink, runs) tuple is
+        appended to self.hyperlink_fields.
+        """
+        if len(field.instructions) != 1:
+            return
+        hl = parse_func(field.instructions[0][1], log)
+        if not hl:
+            return
+        # A target flag without a value means open in a new window
+        if 'target' in hl and hl['target'] is None:
+            hl['target'] = '_blank'
+
+        # We only handle spans in a single paragraph being wrapped in <a>,
+        # so a paragraph element closes the current group of runs
+        groups, pending = [], []
+        for node in field.contents:
+            tag = node.tag
+            if tag.endswith('}p'):
+                if pending:
+                    groups.append(pending)
+                pending = []
+            elif tag.endswith('}r'):
+                pending.append(node)
+        if pending:
+            groups.append(pending)
+
+        self.hyperlink_fields.extend((hl, runs) for runs in groups)
+
+    def parse_xe(self, field, parse_func, log):
+        """
+        Parse an XE field and append the result to self.xe_fields, unless
+        the parsed result is empty. Only the first instruction of the
+        field is looked at.
+        """
+        # TODO: Handle field with multiple instructions
+        raw = field.instructions[0][1]
+        xe = parse_func(raw, log)
+        if not xe:
+            return
+        # TODO: parse the field contents
+        self.xe_fields.append(xe)
+
+    def parse_index(self, field, parse_func, log):
+        """
+        Parse an INDEX field and append the result to self.index_fields.
+        Unlike XE fields, the result is recorded even when it is empty.
+        Fields without instructions are ignored.
+        """
+        if not field.instructions:
+            return
+        raw = field.instructions[0][1]
+        # TODO: parse the field contents
+        self.index_fields.append(parse_func(raw, log))
+
+def test_parse_fields():
    import unittest

-    class TestParseHyperLink(unittest.TestCase):
+    class TestParseFields(unittest.TestCase):

-        def test_parsing(self):
-            self.assertEqual(parse_hyperlink(
-                r'\l anchor1', None), {'anchor':'anchor1'})
-            self.assertEqual(parse_hyperlink(
-                r'www.calibre-ebook.com', None), {'url':'www.calibre-ebook.com'})
-            self.assertEqual(parse_hyperlink(
-                r'www.calibre-ebook.com \t target \o tt', None), {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
-            self.assertEqual(parse_hyperlink(
-                r'"c:\\Some Folder"', None), {'url': 'c:\\Some Folder'})
+        def test_hyperlink(self):
+            ae = lambda x, y: self.assertEqual(parse_hyperlink(x, None), y)
+            ae(r'\l anchor1', {'anchor':'anchor1'})
+            ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'})
+            ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
+            ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'})
+            ae(r'xxxx \y yyyy', {'url': 'xxxx'})

-    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseHyperLink)
+        def test_xe(self):
+            ae = lambda x, y: self.assertEqual(parse_xe(x, None), y)
+            ae(r'"some name"', {'text':'some name'})
+            ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None})
+            ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'})
+
+        def test_index(self):
+            ae = lambda x, y: self.assertEqual(parse_index(x, None), y)
+            ae(r'', {})
+            ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'})
+
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
    unittest.TextTestRunner(verbosity=4).run(suite)
+
+# Run the field parsing tests when this module is executed as a script
+if __name__ == '__main__':
+    test_parse_fields()
--- a/src/calibre/ebooks/docx/index.py
+++ b/src/calibre/ebooks/docx/index.py
@ -0,0 +1,421 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from collections import OrderedDict
+
+from lxml.html.builder import A, P, SPAN
+import lxml.etree
+
+from calibre.ebooks.docx.names import XPath, ancestor, namespaces
+
+
+# Non-breaking space, used as the text of the paragraphs that hold the
+# second and later links of an index entry
+NBSP = '\xa0'
+
+class Location(object):
+    r"""
+    A single place in the text that an index entry refers to. Every index
+    entry owns a list of these, one per reference to the entry.
+
+    bookmark is the name of the bookmark marking the place and target is
+    the text shown for the link to it.
+
+    There is currently no way to mark main entries. Libre office has a
+    main attribute, which doesn't seem to map to docx, and at least some
+    versions of word can mark entries bold or italic with \b and \i.
+    """
+
+    def __init__(self, bookmark, target):
+        self.bookmark, self.target = bookmark, target
+
+class Entry(object):
+    """
+    This class represents one index entry.
+    We can also have a list of sub-entries for the primary/secondary
+    topic situation.
+    Each entry has a list of locations we want to point to, but
+    it could be empty if this is only here to organize sub-entries.
+    """
+
+    def __init__(self, name, index):
+        # Sub-entries, keyed by lower-cased name (see find_entry)
+        self.subentries = {}
+        self.locations = []
+        self.name = name
+        self.index = index
+
+    def add_entry(self, entry, sub):
+        """
+        Record one index marker under this entry.
+        The entry has the form [xxx, field, bookmark, target].
+        sub is the list of sub-topic names still to be descended into. If
+        it is empty the location belongs to this entry, otherwise the
+        marker is passed on to the sub-entry named sub[0].
+        """
+        if not sub:
+            self.locations.append(Location(entry[2], entry[3]))
+        else:
+            sube = find_entry(sub[0], self.subentries, self.index)
+            sube.add_entry(entry, sub[1:])
+
+    def make_link(self, loc, amap):
+        """
+        Create the link element for one location.
+        Returns None if amap has no anchor for the bookmark of the
+        location (the bookmark is missing from amap or maps to None).
+        """
+        # As a first pass, we just put a placeholder in the target location
+        # We want it to float right
+        # NOTE(review): assumes amap is a dict-like object with .get()
+        markid = amap.get(loc.bookmark)
+        if markid is None:
+            return None
+
+        span = A()
+        span.set('style', 'float:right')
+        span.set('href', '#' + markid)
+        from calibre.ebooks.docx.to_html import Text
+        text = Text(span, 'text', [])
+        text.buf.append(loc.target)
+        setattr(text.elem, text.attr, ''.join(text.buf))
+        return span
+
+    def to_htmlunit(self, body, level, amap):
+        """
+        Append the material for one index entry to the document.
+        There is a name, and 0 or more locations.
+        Put the first location, if any, on the same line as the
+        name, and others on following lines.
+        Locations for which no link can be made are skipped.
+        """
+        style = self.index.entry_styles[level]
+        main = add_name(self.name, style)
+        body.append(main)
+        if not self.locations:
+            return
+
+        # First link on same line as name
+        link = self.make_link(self.locations[0], amap)
+        if link is not None:
+            main.append(link)
+
+        # Put other links for same entry on their own lines
+        # To keep the link span separate need to put a space as the name
+        for l in self.locations[1:]:
+            link = self.make_link(l, amap)
+            if link is None:
+                continue
+            dest = P()
+            dest.set('class', style)
+            dest.text = NBSP
+            dest.append(link)
+            body.append(dest)
+
+    def to_html(self, body, level, amap):
+        """
+        Append this entry, followed by all its sub-entries in sorted key
+        order, to body. Only three entry styles exist, so levels deeper
+        than 2 are rendered with the level 2 style.
+        """
+        level = min(level, 2)
+        self.to_htmlunit(body, level, amap)
+        for key in sorted(self.subentries):
+            self.subentries[key].to_html(body, level + 1, amap)
+
+class Section(object):
+    """
+    One section of the index, typically everything filed under a single
+    letter such as the A's or the B's. It is little more than a
+    dictionary of the entries in that section.
+    """
+
+    def __init__(self, index):
+        self.index = index
+        self.entries = {}
+
+    def add_entry(self, entry):
+        """
+        File the information from one index marker in this section.
+        The entry has form [name, field, bookmark, target], where name
+        is something like A or A:B and so on.
+        The top level entry for the name is looked up, or created if it
+        does not exist yet, and the marker is handed to it together with
+        the remaining sub-topic names.
+        """
+        topics = entry[0].strip('"').split(':')
+        head, rest = topics[0], topics[1:]
+        find_entry(head, self.entries, self.index).add_entry(entry, rest)
+
+    def to_html(self, key, body, amap):
+        """
+        Write this section into the html: a heading showing key (left
+        out when key is empty) followed by the entries in sorted order.
+        """
+        if key:
+            heading = add_name(key, self.index.section_style)
+            body.append(heading)
+        for ekey in sorted(self.entries):
+            self.entries[ekey].to_html(body, 0, amap)
+
+class Index(object):
+    """
+    This class generates an alphabetical index from the index markers in a docx file.
+
+    Each field in the parse of the docx file contains an instructions list.
+    Instructions with name XE are index instructions.
+    The instruction also contains the entry specifier, of the form A[:B[:C]] for
+    main entry, A, subentry B, and so on.
+
+    The index object holds a dictionary of sections, keyed by the lower-cased
+    first letter of the entries in the section ('a' for all the A entries,
+    and so on). Entries that do not start with a letter are grouped in the
+    section with the empty key. Each section in turn holds the top level
+    entries filed in it, and every entry holds its locations and sub-entries.
+
+    We could make the formatting more configurable.
+    Currently it uses fixed styles for the various elements, and a section
+    heading for each letter.
+    """
+
+    def __init__(self, convert):
+        """
+        Convert the index markers in the document into an index object.
+        """
+        self.convert = convert
+        self.sections = {}
+
+        self.gen_styles()
+
+        # Get a list of [name, field] entries, where name is the index
+        # entry and field is the indexed location
+        self.entries = self.get_entries()
+
+        # Find the styles which provide the text for links.
+        self.target_styles()
+
+        # Generate bookmarks in the document at the indexed locations,
+        # this extends every entry to [name, field, bookmark, target, ...]
+        self.bookmarks()
+
+        # Set up the entries in index sections
+        for unit in self.entries:
+            sec = self.find_section(unit[0])
+            sec.add_entry(unit)
+
+    def get_entries(self):
+        r"""
+        We already have a list of fields which includes the index marks,
+        identified by an XE tag.
+        In the base case, the field object includes an instruction list
+        with one tuple like ('XE', '"entry"'), where entry is the text we
+        want to put in the index. Note the double quotes around the entry.
+        Sometimes the entry is broken up in the document, for example if
+        there are spelling issues in the entry text.
+        In this case, for reasons I don't understand, the instruction
+        list includes a number of tuples, and we get the actual entry
+        text by concatenating all of them after the initial tag.
+        There can be formatting information in the instructions also, after
+        the double quoted part, like '"entry" \b'.
+        So, we want to concatenate all parts after the initial tag, and
+        then get the part in double quotes.
+
+        Returns a list of [entry text, field] lists, one per XE field.
+        """
+        fields = self.convert.fields.fields
+
+        def get_entry(field):
+            elist = [field.instructions[0][1]]
+            for inst in field.instructions[1:]:
+                elist.append(inst[0])
+                elist.append(inst[1])
+
+            entry = ''.join(elist)
+            sep1 = entry.partition('"')
+            if sep1[2] == '':
+                # Nothing after a double quote (or no double quote at
+                # all), use the text as it is
+                return entry
+            # The text between the first and the second double quote
+            sep2 = sep1[2].partition('"')
+            return sep2[0]
+
+        # Only want the index entries
+        return [[get_entry(f), f] for f in fields
+                if f.instructions and f.instructions[0][0] == 'XE']
+
+    def target_styles(self):
+        """
+        We want to get a list of styles which represent valid index targets.
+        That is, the text of a link in the index will be the title of the
+        section of the document containing the indexed location.
+        We want the list of styles which can provide a valid title.
+        In practice, this maps to Heading1 through Heading3 in the original document.
+        Calibre apparently preprocesses docx files, so that a paragraph in
+        the original with style Heading1 will now have a different, internal style.
+        In this version we use convert.styles.id_map to find style ids
+        with internal names beginning Heading; but I'd feel better if we
+        jumped in earlier and could map it to the original docx styles.
+
+        The result is stored in self.targstyles.
+        """
+        smap = self.convert.styles.id_map
+        self.targstyles = [name for name, style in smap.iteritems() if style.name.lower().startswith('heading')]
+
+    def is_heading(self, node):
+        """
+        Return true if the input node is a valid index link target.
+        """
+        snodes = XPath("./w:pPr/w:pStyle")(node)
+        if len(snodes) == 0:
+            return False
+
+        sn = snodes[0]
+
+        # The key includes the long namespace information
+        k = [key for key in sn.keys() if key.endswith('}val')]
+        if len(k) == 0:
+            return False
+        style = sn.get(k[0])
+        return style in self.targstyles
+
+    def get_headings(self, node):
+        """
+        Get a list of all children of the input node which are headings -
+        that is, valid targets for an index link
+        """
+        # Iterate over the element directly, getchildren() is deprecated
+        return [c for c in node if self.is_heading(c)]
+
+    def text_value(self, node):
+        """
+        Return the concatenated text of the w:t elements in the runs that
+        are direct children of node, or 'Link' if there are none.
+        """
+        tnodes = XPath("./w:r/w:t")(node)
+        if len(tnodes) == 0:
+            return 'Link'
+        return ''.join((x.text or '') for x in tnodes)
+
+    def find_target(self, node):
+        """
+        Given an index entry, find the text of the last heading section
+        preceding the entry.
+        To do this, find the containing w:p element. If it is a heading,
+        return the text.
+        Otherwise, go up the document level by level, starting with the
+        parent of the w:p element containing the entry.
+        At each level, get the list of heading w:p elements which are
+        children of the top node. We also have the index in the top node
+        of the child node containing the entry.
+        Find the largest index of a heading child which is < the entry
+        index, if any - that is the heading we want.
+        If the top of the document is reached without finding a heading,
+        'Link' is returned.
+        Perhaps we should precalculate some of this.
+        We could also consider doing some of this in xpath, but the style
+        attributes have been modified, so we can't just look for the
+        original names.
+        """
+        pnode = ancestor(node, 'w:p')
+        if self.is_heading(pnode):
+            return self.text_value(pnode)
+
+        while True:
+            parent = pnode.getparent()
+            if parent is None:
+                return 'Link'
+
+            # Maintain document order in these lists
+            pindex = parent.index(pnode)
+            hlist = [x for x in self.get_headings(parent) if parent.index(x) < pindex]
+            if hlist:
+                return self.text_value(hlist[-1])
+
+            # Try again
+            pnode = parent
+
+    def bookmarks(self):
+        """
+        For each index entry we need to insert a bookmark at the target location.
+        These bookmarks are for our internal use - I'm not sure they would work well
+        in the original docx document.
+        For each entry we have the Field object, which includes the instrText
+        element of the document.
+        Try going to the parent, and inserting a bookmark start just before it.
+
+        For every instruction element of the field, the name of the new
+        bookmark and the link text found for it are appended to the entry.
+        """
+        bmno = 0
+        for entry in self.entries:
+            for instnode in entry[1].elements:
+                name = 'indexBookmark' + str(bmno)
+                bmno += 1
+                tag = "{%s}bookmarkStart" % namespaces['w']
+                att = "{%s}name" % namespaces['w']
+                bookmark = lxml.etree.Element(tag)
+                bookmark.set(att, name)
+                rnode = instnode.getparent()
+
+                # Add the name so that we can link to it
+                entry.append(name)
+
+                # insert the bookmark before rnode
+                rparent = rnode.getparent()
+                rind = rparent.index(rnode)
+                rparent.insert(rind, bookmark)
+
+                # We want the index entry to be the content of the closest
+                # preceding Heading paragraph.
+                # We should make the targets configurable, and add chapter
+                # titles and maybe other things.
+                # What about numbering?
+                targnode = self.find_target(rnode)
+                entry.append(targnode)
+
+    def gen_styles(self):
+        """
+        Generate css styles for the index elements.
+        We do title, section header, and three levels of entries.
+        These are reasonable styles which only set a couple of key
+        values, but we could provide an interface to allow the user to set them.
+        Is there any problem registering the styles this early in the
+        conversion process?
+        """
+        # The result is a string we can use as a class name.
+        css = OrderedDict([('font-size', '20pt'), ('page-break-before', 'always')])
+        self.title_style = self.convert.styles.register(css, 'block')
+
+        css = OrderedDict([('font-size', '16pt'), ('margin-top', '20pt'), ('margin-bottom', '10pt')])
+        self.section_style = self.convert.styles.register(css, 'block')
+
+        # One style per entry level, each level indented by a further 20pt
+        self.entry_styles = []
+        for i in range(3):
+            indent = str(i*20) + 'pt'
+            css = OrderedDict([('margin-top', '0pt'), ('margin-bottom', '0pt'), ('margin-left', indent)])
+            self.entry_styles.append(self.convert.styles.register(css, 'block'))
+
+    def find_section(self, tag):
+        """
+        Find the section for this index entry, creating it if required.
+        The tag has a form like A or A:B or etc.
+        An empty tag is filed in the same section as the tags that do not
+        start with a letter.
+        If you want a single index without section divisions, you can
+        just return the single section here every time.
+        """
+        # Slice instead of indexing, so that an empty tag gives an empty
+        # section key instead of raising an IndexError
+        shead = tag[:1]
+
+        # Make it lower case, and group all non-alphabetic things together
+        if shead.isalpha():
+            shead = shead.lower()
+        else:
+            shead = ''
+
+        if shead in self.sections:
+            return self.sections[shead]
+        sect = Section(self)
+        self.sections[shead] = sect
+        return sect
+
+    def generate(self):
+        """
+        We generated the index object in the constructor.
+        This method writes it into the html.
+        """
+        # TODO: Only do this at locations of the INDEX field in the document
+        body = self.convert.body
+        body.append(add_name('Index', self.title_style))
+
+        # And write them to the html
+        for key in sorted(self.sections):
+            self.sections[key].to_html(key, body, self.convert.anchor_map)
+
+def add_name(str, clname):
+    """
+    Build a paragraph with class clname that contains a single span
+    whose text is str, and return the paragraph.
+    """
+    # Put this into the convert document map?
+    from calibre.ebooks.docx.to_html import Text
+    label = SPAN()
+    text = Text(label, 'text', [])
+    text.buf.append(str)
+    setattr(text.elem, text.attr, ''.join(text.buf))
+
+    para = P()
+    para.set('class', clname)
+    para.append(label)
+    return para
+
+def find_entry(value, dict, index):
+    """
+    Return the Entry stored in the dictionary for value, creating and
+    storing a new one if there is none yet.
+    The dictionary key is the lower-cased value, so that all
+    capitalizations of a name share a single entry. A newly created
+    entry keeps the capitalization it was first seen with as its name.
+    """
+    key = value.lower()
+    if key not in dict:
+        dict[key] = Entry(value, index)
+    return dict[key]
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -28,6 +28,7 @@ from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
 from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.docx.settings import Settings
+# from calibre.ebooks.docx.index import Index
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

@ -97,6 +98,12 @@ class Convert(object):
        paras = []

        self.log.debug('Converting Word markup to HTML')
+
+        # If we are doing an index, do the body part of the processing here.
+        # We need to insert bookmarks at the indexed locations before the
+        # main conversion work.
+        # index = Index(self)
+
        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
@ -105,6 +112,7 @@ class Convert(object):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
+
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
@ -161,6 +169,9 @@ class Convert(object):

        self.resolve_links()

+        # For an index, we now want to append the index object
+        # index.generate()
+
        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)