From 97b222caca71dfa08d261dcfbf6234010aadbfae Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 May 2014 20:50:34 +0530 Subject: [PATCH] DOCX Input: Handle docx files with index fields that have their field names incorrectly lower cased. Fixes #1318670 [Conversion from DOCX, probably indexitem related](https://bugs.launchpad.net/calibre/+bug/1318670) --- src/calibre/ebooks/docx/fields.py | 8 ++++++++ src/calibre/ebooks/docx/index.py | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 6c4a74d37e..d43bc89ce8 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -126,10 +126,13 @@ class Fields(object): field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref') parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types} + parsers.update({x:getattr(self, 'parse_'+x) for x in field_types}) field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types} + field_parsers.update({f:globals()['parse_%s' % f] for f in field_types}) for f in field_types: setattr(self, '%s_fields' % f, []) + unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'} # The TOC and PAGEREF fields are handled separately for field in self.fields: field.finalize() @@ -137,6 +140,9 @@ class Fields(object): func = parsers.get(field.name, None) if func is not None: func(field, field_parsers[field.name], log) + elif field.name not in unknown_fields: + log.warn('Encountered unknown field: %s, ignoring it.' % field.name) + unknown_fields.add(field.name) def get_runs(self, field): all_runs = [] @@ -200,6 +206,8 @@ class Fields(object): return idx = parse_func(field.instructions, log) hyperlinks, blocks = process_index(field, idx, self.xe_fields, log) + if not blocks: + return for anchor, run in hyperlinks: self.hyperlink_fields.append(({'anchor':anchor}, [run])) diff --git a/src/calibre/ebooks/docx/index.py b/src/calibre/ebooks/docx/index.py index 6cac4f0165..a299fc5042 100644 --- a/src/calibre/ebooks/docx/index.py +++ b/src/calibre/ebooks/docx/index.py @@ -91,7 +91,7 @@ def process_index(field, index, xe_fields, log): xe_fields = get_applicable_xe_fields(index, xe_fields) if not xe_fields: - return + return [], [] if heading_text is not None: groups = partition_by_first_letter(xe_fields, key=itemgetter('text')) items = [] @@ -138,7 +138,7 @@ def split_up_block(block, a, text, parts, ldict): """ The merge algorithm is a little tricky. We start with a list of elementary blocks. Each is an HtmlElement, a p node -with a list of child nodes. The last child is a link, and the earlier ones are +with a list of child nodes. The last child is a link, and the earlier ones are just text. The list is in reverse order from what we want in the index. There is a dictionary ldict which records the level of each child node. @@ -159,7 +159,7 @@ If there are no more levels in n, then add the link from nk to the links for pk. This might be the first link for pk, or we might get a list of references. Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have -the same text, it must follow pk, it must come before we find any other p entries at +the same text, it must follow pk, it must come before we find any other p entries at the same level as pk, and it must have the same level as nk+1. If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1). @@ -208,7 +208,7 @@ def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict): if prevent > 0: merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict) return - + # Want to insert elements into previous block while nind < len(next_block): # insert takes it out of old