From e263479ba1202e9ac8227bc5d1691150aaa50462 Mon Sep 17 00:00:00 2001
From: Peter Garst <pgarst@gmail.com>
Date: Thu, 24 Apr 2014 20:11:56 -0700
Subject: [PATCH] Debugging in index facility

---
 src/calibre/ebooks/docx/index.py | 113 +++++++++++++++++++++++++------
 1 file changed, 92 insertions(+), 21 deletions(-)

diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py
index 5d19721b43..6cac4f0165 100644
--- a/src/calibre/ebooks/docx/index.py
+++ b/src/calibre/ebooks/docx/index.py
@@ -119,7 +119,7 @@ def process_index(field, index, xe_fields, log):
 
     return hyperlinks, blocks
 
-def split_up_block(block, a, text, parts):
+def split_up_block(block, a, text, parts, ldict):
     prefix = parts[:-1]
     a.text = parts[-1]
     parent = a.getparent()
@@ -127,31 +127,100 @@ def split_up_block(block, a, text, parts):
     for i, prefix in enumerate(prefix):
         m = 1.5 * i
         span = parent.makeelement('span', style=style % m)
+        ldict[span]    = i
         parent.append(span)
         span.text = prefix
     span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
     parent.append(span)
     span.append(a)
+    ldict[span]    = len(prefix)
 
-def merge_blocks(prev_block, next_block, prev_path, next_path):
-    pa, na = prev_block.xpath('descendant::a'), next_block.xpath('descendant::a[1]')
-    if not pa or not na:
-        return
-    pa, na = pa[-1], na[0]
-    if prev_path == next_path:
+"""
+The merge algorithm is a little tricky.
+We start with a list of elementary blocks. Each is an HtmlElement, a p node
+with a list of child nodes. The last child is a link, and the earlier ones are 
+just text.
+The list is in reverse order from what we want in the index.
+There is a dictionary ldict which records the level of each child node.
+
+Now we want to do a reduce-like operation, combining all blocks with the same
+top level index entry into a single block representing the structure of all
+references, subentries, etc. under that top entry.
+Here's the algorithm.
+
+Given a block p and the next block n, and the top level entries p1 and n1 in each
+block, which we assume have the same text:
+
+Start with (p, p1) and (n, n1).
+
+Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
+
+If there are no more levels in n, then add the link from nk to the links for pk.
+This might be the first link for pk, or we might get a list of references.
+
+Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
+the same text, it must follow pk, it must come before we find any other p entries at 
+the same level as pk, and it must have the same level as nk+1.
+
+If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
+
+If there is no matching entry, then because of the original reversed order we want
+to insert nk+1 and all following entries from n into p immediately following pk.
+"""
+
+def find_match(prev_block, pind, nextent, ldict):
+    curlevel = ldict[prev_block[pind]]
+    for p in range(pind+1, len(prev_block)):
+        trylev = ldict[prev_block[p]]
+        if trylev <= curlevel:
+            return -1
+        if trylev > (curlevel+1):
+            continue
+        if prev_block[p].text_content() == nextent.text_content():
+            return p
+    return -1
+
+def add_link(pent, nent, ldict):
+    na = nent.xpath('descendant::a[1]')
+    na = na[0]
+    pa = pent.xpath('descendant::a')
+    if pa and len(pa) > 0:
         # Put on same line with a comma
+        pa = pa[-1]
         pa.tail = ', '
         p = pa.getparent()
         p.insert(p.index(pa) + 1, na)
     else:
-        # Add a line to the previous block
-        ps, ns = pa.getparent(), na.getparent()
-        p = ps.getparent()
-        p.insert(p.index(ps) + 1, ns)
+        # substitute link na for plain text in pent
+        pent.text = ""
+        pent.append(na)
+
+def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
+    # First elements match. Any more in next?
+    if len(next_path) == (nind + 1):
+        nextent = next_block[nind]
+        add_link(prev_block[pind], nextent, ldict)
+        return
+
+    nind = nind + 1
+    nextent = next_block[nind]
+    prevent = find_match(prev_block, pind, nextent, ldict)
+    if prevent > 0:
+        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
+        return
+    
+    # Want to insert elements into previous block
+    while nind < len(next_block):
+        # insert takes it out of old
+        pind = pind + 1
+        prev_block.insert(pind, next_block[nind])
+
     next_block.getparent().remove(next_block)
 
 def polish_index_markup(index, blocks):
+    # Blocks are in reverse order at this point
     path_map = {}
+    ldict = {}
     for block in blocks:
         cls = block.get('class', '') or ''
         block.set('class', (cls + ' index-entry').lstrip())
@@ -162,20 +231,22 @@ def polish_index_markup(index, blocks):
         if ':' in text:
             path_map[block] = parts = filter(None, (x.strip() for x in text.split(':')))
             if len(parts) > 1:
-                split_up_block(block, a[0], text, parts)
+                split_up_block(block, a[0], text, parts, ldict)
         else:
+            # try using a span all the time
             path_map[block] = [text]
+            parent = a[0].getparent()
+            span = parent.makeelement('span', style='display:block; margin-left: 0em')
+            parent.append(span)
+            span.append(a[0])
+            ldict[span] = 0
 
+    # We want a single block for each main entry
     prev_block = blocks[0]
     for block in blocks[1:]:
         pp, pn = path_map[prev_block], path_map[block]
-        if pp == pn:
-            merge_blocks(prev_block, block, pp, pn)
-        elif len(pp) > 1 and len(pn) >= len(pp):
-            if pn[:-1] in (pp[:-1], pp):
-                merge_blocks(prev_block, block, pp, pn)
-            # It's possible to have pn starting with pp but having more
-            # than one extra entry, but until I see that in the wild, I'm not
-            # going to bother
-        prev_block = block
+        if pp[0] == pn[0]:
+            merge_blocks(prev_block, block, 0, 0, pn, ldict)
+        else:
+            prev_block = block