DOCX Input: Fix handling of hyperlinks in documents where the hyperlink text is split across multiple field components

This commit is contained in:
Kovid Goyal 2014-03-31 11:21:33 +05:30
parent 2a9643793a
commit b23cb51d74

View File

@ -16,16 +16,21 @@ class Field(object):
self.start = start self.start = start
self.end = None self.end = None
self.contents = [] self.contents = []
self.elements = [] self.buf = []
self.instructions = [] self.instructions = None
self.name = None
def add_instr(self, elem): def add_instr(self, elem):
raw = elem.text raw = elem.text
if not raw: if not raw:
return return
name, rest = raw.strip().partition(' ')[0::2] if self.name is None:
self.instructions.append((name, rest.strip())) self.name, raw = raw.strip().partition(' ')[0::2]
self.elements.append(elem) self.buf.append(raw)
def finalize(self):
self.instructions = ''.join(self.buf)
del self.buf
WORD, FLAG = 0, 1 WORD, FLAG = 0, 1
scanner = re.Scanner([ scanner = re.Scanner([
@ -109,48 +114,46 @@ class Fields(object):
setattr(self, '%s_fields' % f, []) setattr(self, '%s_fields' % f, [])
for field in self.fields: for field in self.fields:
field.finalize()
if field.instructions: if field.instructions:
name = field.instructions[0][0] func = parsers.get(field.name, None)
func = parsers.get(name, None)
if func is not None: if func is not None:
func(field, field_parsers[name], log) func(field, field_parsers[field.name], log)
def parse_hyperlink(self, field, parse_func, log): def parse_hyperlink(self, field, parse_func, log):
# Parse hyperlink fields # Parse hyperlink fields
if len(field.instructions) == 1: hl = parse_func(field.instructions, log)
hl = parse_func(field.instructions[0][1], log) if hl:
if hl: if 'target' in hl and hl['target'] is None:
if 'target' in hl and hl['target'] is None: hl['target'] = '_blank'
hl['target'] = '_blank' all_runs = []
all_runs = [] current_runs = []
current_runs = [] # We only handle spans in a single paragraph
# We only handle spans in a single paragraph # being wrapped in <a>
# being wrapped in <a> for x in field.contents:
for x in field.contents: if x.tag.endswith('}p'):
if x.tag.endswith('}p'): if current_runs:
if current_runs: all_runs.append(current_runs)
all_runs.append(current_runs) current_runs = []
current_runs = [] elif x.tag.endswith('}r'):
elif x.tag.endswith('}r'): current_runs.append(x)
current_runs.append(x) if current_runs:
if current_runs: all_runs.append(current_runs)
all_runs.append(current_runs) for runs in all_runs:
for runs in all_runs: self.hyperlink_fields.append((hl, runs))
self.hyperlink_fields.append((hl, runs))
def parse_xe(self, field, parse_func, log): def parse_xe(self, field, parse_func, log):
# Parse XE fields # Parse XE fields
xe = parse_func(field.instructions[0][1], log) # TODO: Handle field with multiple instructions xe = parse_func(field.instructions, log) # TODO: Handle field with multiple instructions
if xe: if xe:
# TODO: parse the field contents # TODO: parse the field contents
self.xe_fields.append(xe) self.xe_fields.append(xe)
def parse_index(self, field, parse_func, log): def parse_index(self, field, parse_func, log):
# Parse Index fields # Parse Index fields
if len(field.instructions): idx = parse_func(field.instructions, log)
idx = parse_func(field.instructions[0][1], log) # TODO: parse the field contents
# TODO: parse the field contents self.index_fields.append(idx)
self.index_fields.append(idx)
def test_parse_fields(): def test_parse_fields():
import unittest import unittest