From 5d4c4b857e47bff419e06d64f18112b92fee3b6f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 30 Mar 2014 21:21:16 +0530 Subject: [PATCH] Parse INDEX fields --- src/calibre/ebooks/docx/fields.py | 96 ++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py index 24be0cad86..90e80423ff 100644 --- a/src/calibre/ebooks/docx/fields.py +++ b/src/calibre/ebooks/docx/fields.py @@ -37,7 +37,9 @@ scanner = re.Scanner([ null = object() -def parser(name, field_map, default_field_name): +def parser(name, field_map, default_field_name=None): + + field_map = dict((x.split(':') for x in field_map.split())) def parse(raw, log=None): ans = {} @@ -63,10 +65,15 @@ def parser(name, field_map, default_field_name): return parse parse_hyperlink = parser('hyperlink', - {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}, 'url') + 'l:anchor m:image-map n:target o:title t:target', 'url') parse_xe = parser('xe', - {'b':'bold', 'i':'italic', 'f':'entry_type', 'r':'page_range_bookmark', 't':'page_number_text', 'y':'yomi'}, 'text') + 'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text') + +parse_index = parser('index', + 'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator' + ' f:entry-type g:page-range-separator h:heading k:crossref-separator' + ' p:page-number-separator r:run-together y:yomi z:langcode') class Fields(object): @@ -94,38 +101,56 @@ class Fields(object): if stack: stack[-1].contents.append(elem) - # Parse hyperlink fields - self.hyperlink_fields = [] - for field in self.fields: - if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK': - hl = parse_hyperlink(field.instructions[0][1], log) - if hl: - if 'target' in hl and hl['target'] is None: - hl['target'] = '_blank' - all_runs = [] - current_runs = [] - # We only handle spans in a single paragraph - # being 
wrapped in <a> - for x in field.contents: - if x.tag.endswith('}p'): - if current_runs: - all_runs.append(current_runs) - current_runs = [] - elif x.tag.endswith('}r'): - current_runs.append(x) - if current_runs: - all_runs.append(current_runs) - for runs in all_runs: - self.hyperlink_fields.append((hl, runs)) + field_types = ('hyperlink', 'xe', 'index') + parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types} + field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types} + + for f in field_types: + setattr(self, '%s_fields' % f, []) - # Parse XE fields - self.xe_fields = [] for field in self.fields: - if len(field.instructions) >= 1 and field.instructions[0][0] == 'HYPERLINK': - xe = parse_xe(field.instructions[0][1], log) # TODO: Handle field with multiple instructions - if xe: - # TODO: parse the field contents - self.xe_fields.append(xe) + if field.instructions: + name = field.instructions[0][0] + func = parsers.get(name, None) + if func is not None: + func(field, field_parsers[name], log) + + def parse_hyperlink(self, field, parse_func, log): + # Parse hyperlink fields + if len(field.instructions) == 1: + hl = parse_func(field.instructions[0][1], log) + if hl: + if 'target' in hl and hl['target'] is None: + hl['target'] = '_blank' + all_runs = [] + current_runs = [] + # We only handle spans in a single paragraph + # being wrapped in <a> + for x in field.contents: + if x.tag.endswith('}p'): + if current_runs: + all_runs.append(current_runs) + current_runs = [] + elif x.tag.endswith('}r'): + current_runs.append(x) + if current_runs: + all_runs.append(current_runs) + for runs in all_runs: + self.hyperlink_fields.append((hl, runs)) + + def parse_xe(self, field, parse_func, log): + # Parse XE fields + xe = parse_func(field.instructions[0][1], log) # TODO: Handle field with multiple instructions + if xe: + # TODO: parse the field contents + self.xe_fields.append(xe) + + def parse_index(self, field, parse_func, log): + # Parse Index fields + if
len(field.instructions): + idx = parse_func(field.instructions[0][1], log) + # TODO: parse the field contents + self.index_fields.append(idx) def test_parse_fields(): import unittest @@ -146,6 +171,11 @@ def test_parse_fields(): ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None}) ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'}) + def test_index(self): + ae = lambda x, y: self.assertEqual(parse_index(x, None), y) + ae(r'', {}) + ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'}) + suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields) unittest.TextTestRunner(verbosity=4).run(suite)