DOCX Input: URL-less hyperlink fields and stacked block-level bookmarks

DOCX Input: Add support for hyperlink fields that have only anchors and not URLs. DOCX Input: Fix handling of multiple block-level bookmarks at the same location. Fixes #1241451 [links are not working](https://bugs.launchpad.net/calibre/+bug/1241451)
2025-07-31 14:33:54 -04:00 · 2013-10-22 15:46:31 +05:30 · 2013-10-22 15:46:31 +05:30 · e260906211
commit e260906211
parent 93c872e8f3
2 changed files with 40 additions and 16 deletions
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@@ -40,18 +40,16 @@ def parse_hyperlink(raw, log):
    raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
    for token, token_type in scanner.scan(raw)[0]:
        token = token.replace('\x01', '\\').replace('\x02', '"')
-        if not ans:
-            if token_type is not WORD:
-                log('Invalid hyperlink, first token is not a URL (%s)' % raw)
-                return ans
-            ans['url'] = token
        if token_type is FLAG:
            last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
            if last_option is not None:
                ans[last_option] = None
        elif token_type is WORD:
-            if last_option is not None:
+            if last_option is None:
+                ans['url'] = token
+            else:
                ans[last_option] = token
+                last_option = None
    return ans


@@ -105,4 +103,20 @@ class Fields(object):
                    for runs in all_runs:
                        self.hyperlink_fields.append((hl, runs))

+def test_parse_hyperlink():
+    import unittest

+    class TestParseHyperLink(unittest.TestCase):
+
+        def test_parsing(self):
+            self.assertEqual(parse_hyperlink(
+                r'\l anchor1', None), {'anchor':'anchor1'})
+            self.assertEqual(parse_hyperlink(
+                r'www.calibre-ebook.com', None), {'url':'www.calibre-ebook.com'})
+            self.assertEqual(parse_hyperlink(
+                r'www.calibre-ebook.com \t target \o tt', None), {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
+            self.assertEqual(parse_hyperlink(
+                r'"c:\\Some Folder"', None), {'url': 'c:\\Some Folder'})
+
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseHyperLink)
+    unittest.TextTestRunner(verbosity=4).run(suite)
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -346,18 +346,21 @@ class Convert(object):
    def read_block_anchors(self, doc):
        doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
-            current_bm = None
+            current_bm = set()
            rmap = {v:k for k, v in self.object_map.iteritems()}
            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
-                            para.set('id', generate_anchor(current_bm, frozenset(self.anchor_map.itervalues())))
-                        self.anchor_map[current_bm] = para.get('id')
-                        current_bm = None
+                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.itervalues())))
+                        for name in current_bm:
+                            self.anchor_map[name] = para.get('id')
+                        current_bm = set()
                elif p in doc_anchors:
-                    current_bm = get(p, 'w:name')
+                    anchor = get(p, 'w:name')
+                    if anchor:
+                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
@@ -500,11 +503,18 @@ class Convert(object):
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
-            url = hyperlink['url']
-            if url in self.anchor_map:
-                span.set('href', '#' + self.anchor_map[url])
-                continue
-            span.set('href', url)
+            url = hyperlink.get('url', None)
+            if url is None:
+                anchor = hyperlink.get('anchor', None)
+                if anchor in self.anchor_map:
+                    span.set('href', '#' + self.anchor_map[anchor])
+                    continue
+                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
+            else:
+                if url in self.anchor_map:
+                    span.set('href', '#' + self.anchor_map[url])
+                    continue
+                span.set('href', url)

        for img, link in self.images.links:
            parent = img.getparent()