mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	DOCX Input: Fix formatting of the generated Index when the index is complex, i.e. with lots of references to the same item, multiple levels of sub-items, etc.
Merge branch 'master' of https://github.com/pgarst/calibre
This commit is contained in:
		
						commit
						1f79b14973
					
				@ -119,7 +119,7 @@ def process_index(field, index, xe_fields, log):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    return hyperlinks, blocks
 | 
					    return hyperlinks, blocks
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def split_up_block(block, a, text, parts):
 | 
					def split_up_block(block, a, text, parts, ldict):
 | 
				
			||||||
    prefix = parts[:-1]
 | 
					    prefix = parts[:-1]
 | 
				
			||||||
    a.text = parts[-1]
 | 
					    a.text = parts[-1]
 | 
				
			||||||
    parent = a.getparent()
 | 
					    parent = a.getparent()
 | 
				
			||||||
@ -127,31 +127,100 @@ def split_up_block(block, a, text, parts):
 | 
				
			|||||||
    for i, prefix in enumerate(prefix):
 | 
					    for i, prefix in enumerate(prefix):
 | 
				
			||||||
        m = 1.5 * i
 | 
					        m = 1.5 * i
 | 
				
			||||||
        span = parent.makeelement('span', style=style % m)
 | 
					        span = parent.makeelement('span', style=style % m)
 | 
				
			||||||
 | 
					        ldict[span]    = i
 | 
				
			||||||
        parent.append(span)
 | 
					        parent.append(span)
 | 
				
			||||||
        span.text = prefix
 | 
					        span.text = prefix
 | 
				
			||||||
    span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
 | 
					    span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
 | 
				
			||||||
    parent.append(span)
 | 
					    parent.append(span)
 | 
				
			||||||
    span.append(a)
 | 
					    span.append(a)
 | 
				
			||||||
 | 
					    ldict[span]    = len(prefix)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def merge_blocks(prev_block, next_block, prev_path, next_path):
 | 
					"""
 | 
				
			||||||
    pa, na = prev_block.xpath('descendant::a'), next_block.xpath('descendant::a[1]')
 | 
					The merge algorithm is a little tricky.
 | 
				
			||||||
    if not pa or not na:
 | 
					We start with a list of elementary blocks. Each is an HtmlElement, a p node
 | 
				
			||||||
        return
 | 
					with a list of child nodes. The last child is a link, and the earlier ones are 
 | 
				
			||||||
    pa, na = pa[-1], na[0]
 | 
					just text.
 | 
				
			||||||
    if prev_path == next_path:
 | 
					The list is in reverse order from what we want in the index.
 | 
				
			||||||
 | 
					There is a dictionary ldict which records the level of each child node.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Now we want to do a reduce-like operation, combining all blocks with the same
 | 
				
			||||||
 | 
					top level index entry into a single block representing the structure of all
 | 
				
			||||||
 | 
					references, subentries, etc. under that top entry.
 | 
				
			||||||
 | 
					Here's the algorithm.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Given a block p and the next block n, and the top level entries p1 and n1 in each
 | 
				
			||||||
 | 
					block, which we assume have the same text:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Start with (p, p1) and (n, n1).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If there are no more levels in n, then add the link from nk to the links for pk.
 | 
				
			||||||
 | 
					This might be the first link for pk, or we might get a list of references.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
 | 
				
			||||||
 | 
					the same text, it must follow pk, it must come before we find any other p entries at 
 | 
				
			||||||
 | 
					the same level as pk, and it must have the same level as nk+1.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If there is no matching entry, then because of the original reversed order we want
 | 
				
			||||||
 | 
					to insert nk+1 and all following entries from n into p immediately following pk.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def find_match(prev_block, pind, nextent, ldict):
 | 
				
			||||||
 | 
					    curlevel = ldict[prev_block[pind]]
 | 
				
			||||||
 | 
					    for p in range(pind+1, len(prev_block)):
 | 
				
			||||||
 | 
					        trylev = ldict[prev_block[p]]
 | 
				
			||||||
 | 
					        if trylev <= curlevel:
 | 
				
			||||||
 | 
					            return -1
 | 
				
			||||||
 | 
					        if trylev > (curlevel+1):
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if prev_block[p].text_content() == nextent.text_content():
 | 
				
			||||||
 | 
					            return p
 | 
				
			||||||
 | 
					    return -1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def add_link(pent, nent, ldict):
 | 
				
			||||||
 | 
					    na = nent.xpath('descendant::a[1]')
 | 
				
			||||||
 | 
					    na = na[0]
 | 
				
			||||||
 | 
					    pa = pent.xpath('descendant::a')
 | 
				
			||||||
 | 
					    if pa and len(pa) > 0:
 | 
				
			||||||
        # Put on same line with a comma
 | 
					        # Put on same line with a comma
 | 
				
			||||||
 | 
					        pa = pa[-1]
 | 
				
			||||||
        pa.tail = ', '
 | 
					        pa.tail = ', '
 | 
				
			||||||
        p = pa.getparent()
 | 
					        p = pa.getparent()
 | 
				
			||||||
        p.insert(p.index(pa) + 1, na)
 | 
					        p.insert(p.index(pa) + 1, na)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        # Add a line to the previous block
 | 
					        # substitute link na for plain text in pent
 | 
				
			||||||
        ps, ns = pa.getparent(), na.getparent()
 | 
					        pent.text = ""
 | 
				
			||||||
        p = ps.getparent()
 | 
					        pent.append(na)
 | 
				
			||||||
        p.insert(p.index(ps) + 1, ns)
 | 
					
 | 
				
			||||||
 | 
					def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
 | 
				
			||||||
 | 
					    # First elements match. Any more in next?
 | 
				
			||||||
 | 
					    if len(next_path) == (nind + 1):
 | 
				
			||||||
 | 
					        nextent = next_block[nind]
 | 
				
			||||||
 | 
					        add_link(prev_block[pind], nextent, ldict)
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    nind = nind + 1
 | 
				
			||||||
 | 
					    nextent = next_block[nind]
 | 
				
			||||||
 | 
					    prevent = find_match(prev_block, pind, nextent, ldict)
 | 
				
			||||||
 | 
					    if prevent > 0:
 | 
				
			||||||
 | 
					        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    # Want to insert elements into previous block
 | 
				
			||||||
 | 
					    while nind < len(next_block):
 | 
				
			||||||
 | 
					        # insert() moves the node out of next_block, so next_block shrinks on every pass and nind stays put
 | 
				
			||||||
 | 
					        pind = pind + 1
 | 
				
			||||||
 | 
					        prev_block.insert(pind, next_block[nind])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    next_block.getparent().remove(next_block)
 | 
					    next_block.getparent().remove(next_block)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def polish_index_markup(index, blocks):
 | 
					def polish_index_markup(index, blocks):
 | 
				
			||||||
 | 
					    # Blocks are in reverse order at this point
 | 
				
			||||||
    path_map = {}
 | 
					    path_map = {}
 | 
				
			||||||
 | 
					    ldict = {}
 | 
				
			||||||
    for block in blocks:
 | 
					    for block in blocks:
 | 
				
			||||||
        cls = block.get('class', '') or ''
 | 
					        cls = block.get('class', '') or ''
 | 
				
			||||||
        block.set('class', (cls + ' index-entry').lstrip())
 | 
					        block.set('class', (cls + ' index-entry').lstrip())
 | 
				
			||||||
@ -162,20 +231,22 @@ def polish_index_markup(index, blocks):
 | 
				
			|||||||
        if ':' in text:
 | 
					        if ':' in text:
 | 
				
			||||||
            path_map[block] = parts = filter(None, (x.strip() for x in text.split(':')))
 | 
					            path_map[block] = parts = filter(None, (x.strip() for x in text.split(':')))
 | 
				
			||||||
            if len(parts) > 1:
 | 
					            if len(parts) > 1:
 | 
				
			||||||
                split_up_block(block, a[0], text, parts)
 | 
					                split_up_block(block, a[0], text, parts, ldict)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
 | 
					            # Always wrap the link in a span (level 0 here), so entries without sub-levels also get a span recorded in ldict
 | 
				
			||||||
            path_map[block] = [text]
 | 
					            path_map[block] = [text]
 | 
				
			||||||
 | 
					            parent = a[0].getparent()
 | 
				
			||||||
 | 
					            span = parent.makeelement('span', style='display:block; margin-left: 0em')
 | 
				
			||||||
 | 
					            parent.append(span)
 | 
				
			||||||
 | 
					            span.append(a[0])
 | 
				
			||||||
 | 
					            ldict[span] = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # We want a single block for each main entry
 | 
				
			||||||
    prev_block = blocks[0]
 | 
					    prev_block = blocks[0]
 | 
				
			||||||
    for block in blocks[1:]:
 | 
					    for block in blocks[1:]:
 | 
				
			||||||
        pp, pn = path_map[prev_block], path_map[block]
 | 
					        pp, pn = path_map[prev_block], path_map[block]
 | 
				
			||||||
        if pp == pn:
 | 
					        if pp[0] == pn[0]:
 | 
				
			||||||
            merge_blocks(prev_block, block, pp, pn)
 | 
					            merge_blocks(prev_block, block, 0, 0, pn, ldict)
 | 
				
			||||||
        elif len(pp) > 1 and len(pn) >= len(pp):
 | 
					        else:
 | 
				
			||||||
            if pn[:-1] in (pp[:-1], pp):
 | 
					            prev_block = block
 | 
				
			||||||
                merge_blocks(prev_block, block, pp, pn)
 | 
					 | 
				
			||||||
            # It's possible to have pn starting with pp but having more
 | 
					 | 
				
			||||||
            # than one extra entry, but until I see that in the wild, I'm not
 | 
					 | 
				
			||||||
            # going to bother
 | 
					 | 
				
			||||||
        prev_block = block
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user