DOCX: Handle hyperlinks created as fields

See https://bugs.launchpad.net/calibre/+bug/1196728 for an example.
2025-07-09 03:04:10 -04:00 · 2013-07-03 10:58:48 +05:30 · 2013-07-03 10:58:48 +05:30 · 3b4094a890
commit 3b4094a890
parent e8839bc8dc
2 changed files with 128 additions and 0 deletions
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@ -0,0 +1,106 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import re
 from calibre.ebooks.docx.names import XPath, get
 class Field(object):
    '''
    A single complex field: the element that opened it, the element that
    closed it, the elements seen while it was open and the instructions
    read from its instruction text.
    '''
    def __init__(self, start):
        # The element that opened this field
        self.start = start
        # The element that closed this field, None until one is assigned
        self.end = None
        # Elements collected while this field was the innermost open one
        self.contents = []
        # List of (name, arguments) tuples, one per non-empty instruction text
        self.instructions = []
    def add_instr(self, elem):
        '''
        Parse the text of elem into a (name, arguments) tuple and append it
        to self.instructions. The name is the first whitespace delimited
        word, the arguments are the remaining text with surrounding
        whitespace removed (an empty string if there is none).
        Elements whose text is missing, empty or only whitespace are ignored,
        so that they are not recorded as an instruction with an empty name.
        '''
        raw = (elem.text or '').strip()
        if not raw:
            return
        # Split on any run of whitespace, not just a single space, so that
        # a tab or newline after the field name is also a valid separator
        parts = raw.split(None, 1)
        name = parts[0]
        rest = parts[1] if len(parts) > 1 else ''
        self.instructions.append((name, rest.strip()))
 # The two kinds of token produced by the scanner below
 WORD, FLAG = 0, 1
 # Tokenizer for the argument text of a field instruction. Every token is a
 # (text, kind) tuple, whitespace between tokens is discarded. Quoted words
 # are returned without their surrounding quotes.
 scanner = re.Scanner([
    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
    (r'\s+', None),
 ], flags=re.DOTALL)
 def parse_hyperlink(raw, log):
    '''
    Parse the arguments of a HYPERLINK field instruction.
    :param raw: The instruction text that follows the field name
    :param log: A callable, it is called with a message when the first token
                is not a word
    :return: A dict. It is empty when raw has no tokens or when the first
             token is not a word. Otherwise it has the key 'url' and one key
             for every recognized flag found: 'anchor', 'image-map', 'target'
             or 'title'. The value for a flag is the word that directly
             follows it, or None when no word follows it.
    '''
    # Built once per call rather than once per flag token
    options = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}
    ans = {}
    last_option = None
    # Anything the scanner could not tokenize is returned as the second item
    # of scan() and is deliberately ignored
    for token, token_type in scanner.scan(raw)[0]:
        if not ans:
            if token_type != WORD:
                log('Invalid hyperlink, first token is not a URL (%s)' % raw)
                return ans
            ans['url'] = token
            continue
        if token_type == FLAG:
            # Unrecognized flags yield None, so that a word following them
            # is not stored anywhere
            last_option = options.get(token[1])
            if last_option is not None:
                ans[last_option] = None
        elif last_option is not None:
            ans[last_option] = token
            # A flag takes at most one value, later stray words must not
            # overwrite it
            last_option = None
    return ans
 class Fields(object):
    '''
    Collects the fields delimited by w:fldChar begin/end elements in a
    document and works out which of them are hyperlinks. Call the instance
    with the document, the results are then available as self.fields and
    self.hyperlink_fields.
    '''
    def __init__(self):
        # Every Field found, in the order in which the fields were opened
        self.fields = []
        # List of (hyperlink dict, list of runs) tuples. Rebuilt by __call__,
        # created here so that it can be read before a document is scanned
        self.hyperlink_fields = []
    @staticmethod
    def _runs_by_paragraph(contents):
        ' Split the runs in contents into one list per paragraph, paragraphs without runs are dropped '
        all_runs = []
        current_runs = []
        for x in contents:
            if x.tag.endswith('}p'):
                if current_runs:
                    all_runs.append(current_runs)
                current_runs = []
            elif x.tag.endswith('}r'):
                current_runs.append(x)
        if current_runs:
            all_runs.append(current_runs)
        return all_runs
    def __call__(self, doc, log):
        '''
        Scan doc for fields and hyperlinks.
        :param doc: The document to scan, it is passed to the XPath helper
        :param log: A callable, passed on to parse_hyperlink() which uses it
                    to report hyperlinks it cannot parse
        '''
        # Fields can be nested, the innermost open field is stack[-1]
        stack = []
        for elem in XPath(
            '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
            if elem.tag.endswith('}fldChar'):
                typ = get(elem, 'w:fldCharType')
                if typ == 'begin':
                    stack.append(Field(elem))
                    self.fields.append(stack[-1])
                elif stack:
                    # An end marker without a matching begin is ignored
                    stack.pop().end = elem
            elif elem.tag.endswith('}instrText'):
                if stack:
                    stack[-1].add_instr(elem)
            else:
                # A paragraph or a run, only of interest inside a field
                if stack:
                    stack[-1].contents.append(elem)
        # Parse hyperlink fields
        self.hyperlink_fields = []
        for field in self.fields:
            if len(field.instructions) != 1 or field.instructions[0][0] != 'HYPERLINK':
                continue
            hl = parse_hyperlink(field.instructions[0][1], log)
            if not hl:
                continue
            # A target flag that was not followed by a value
            if 'target' in hl and hl['target'] is None:
                hl['target'] = '_blank'
            # We only handle spans in a single paragraph
            # being wrapped in <a>
            for runs in self._runs_by_paragraph(field.contents):
                self.hyperlink_fields.append((hl, runs))
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -26,6 +26,7 @@ from calibre.ebooks.docx.footnotes import Footnotes
 from calibre.ebooks.docx.cleanup import cleanup_markup
 from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
 from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -52,6 +53,7 @@ class Convert(object):
        self.body = BODY()
        self.theme = Theme()
        self.tables = Tables()
        self.fields = Fields()
        self.styles = Styles(self.tables)
        self.images = Images()
        self.object_map = OrderedDict()
@ -79,6 +81,7 @@ class Convert(object):
    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
@ -396,6 +399,25 @@ class Convert(object):
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        # Invert object_map so that the generated element can be looked up
        # from the run stored for it
        rmap = {v:k for k, v in self.object_map.iteritems()}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            # A single span is turned into the link itself, several spans
            # are first wrapped in a new one. Without the explicit assignment
            # a span left over from earlier code would be modified instead.
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink['url']
            # URLs that are keys of anchor_map become links inside the
            # document, everything else is used unchanged
            if url in self.anchor_map:
                span.set('href', '#' + self.anchor_map[url])
                continue
            span.set('href', url)
    def convert_run(self, run):
        ans = SPAN()