DOCX Input: Fix handling of hyperlinks in documents where the hyperlink text is split across multiple field components

2025-07-09 03:04:10 -04:00 · 2014-03-31 11:21:33 +05:30 · 2014-03-31 11:21:33 +05:30 · b23cb51d74
commit b23cb51d74
parent 2a9643793a
1 changed file with 36 additions and 33 deletions
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@@ -16,16 +16,21 @@ class Field(object):
        self.start = start
        self.end = None
        self.contents = []
-        self.elements = []
+        self.buf = []
-        self.instructions = []
+        self.instructions = None
        self.name = None
    def add_instr(self, elem):
        raw = elem.text
        if not raw:
            return
-        name, rest = raw.strip().partition(' ')[0::2]
+        if self.name is None:
-        self.instructions.append((name, rest.strip()))
+            self.name, raw = raw.strip().partition(' ')[0::2]
-        self.elements.append(elem)
+        self.buf.append(raw)
    def finalize(self):
        self.instructions = ''.join(self.buf)
        del self.buf
 WORD, FLAG = 0, 1
 scanner = re.Scanner([
@@ -109,48 +114,46 @@ class Fields(object):
            setattr(self, '%s_fields' % f, [])
        for field in self.fields:
            field.finalize()
            if field.instructions:
-                name = field.instructions[0][0]
+                func = parsers.get(field.name, None)
                func = parsers.get(name, None)
                if func is not None:
-                    func(field, field_parsers[name], log)
+                    func(field, field_parsers[field.name], log)
    def parse_hyperlink(self, field, parse_func, log):
        # Parse hyperlink fields
-        if len(field.instructions) == 1:
+        hl = parse_func(field.instructions, log)
-            hl = parse_func(field.instructions[0][1], log)
+        if hl:
-            if hl:
+            if 'target' in hl and hl['target'] is None:
-                if 'target' in hl and hl['target'] is None:
+                hl['target'] = '_blank'
-                    hl['target'] = '_blank'
+            all_runs = []
-                all_runs = []
+            current_runs = []
-                current_runs = []
+            # We only handle spans in a single paragraph
-                # We only handle spans in a single paragraph
+            # being wrapped in <a>
-                # being wrapped in <a>
+            for x in field.contents:
-                for x in field.contents:
+                if x.tag.endswith('}p'):
-                    if x.tag.endswith('}p'):
+                    if current_runs:
-                        if current_runs:
+                        all_runs.append(current_runs)
-                            all_runs.append(current_runs)
+                    current_runs = []
-                        current_runs = []
+                elif x.tag.endswith('}r'):
-                    elif x.tag.endswith('}r'):
+                    current_runs.append(x)
-                        current_runs.append(x)
+            if current_runs:
-                if current_runs:
+                all_runs.append(current_runs)
-                    all_runs.append(current_runs)
+            for runs in all_runs:
-                for runs in all_runs:
+                self.hyperlink_fields.append((hl, runs))
                    self.hyperlink_fields.append((hl, runs))
    def parse_xe(self, field, parse_func, log):
        # Parse XE fields
-        xe = parse_func(field.instructions[0][1], log)  # TODO: Handle field with multiple instructions
+        xe = parse_func(field.instructions, log)  # TODO: Handle field with multiple instructions
        if xe:
            # TODO: parse the field contents
            self.xe_fields.append(xe)
    def parse_index(self, field, parse_func, log):
        # Parse Index fields
-        if len(field.instructions):
+        idx = parse_func(field.instructions, log)
-            idx = parse_func(field.instructions[0][1], log)
+        # TODO: parse the field contents
-            # TODO: parse the field contents
+        self.index_fields.append(idx)
            self.index_fields.append(idx)
 def test_parse_fields():
    import unittest