diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index bcca336474..95ce6e4f9b 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -181,7 +181,9 @@ class DOCX(object): else: root = fromstring(raw) for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): - target = '/'.join((base, item.get('Target').lstrip('/'))) + target = item.get('Target') + if item.get('TargetMode', None) != 'External': + target = '/'.join((base, target.lstrip('/'))) typ = item.get('Type') Id = item.get('Id') by_id[Id] = by_type[typ] = target diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py index 3d365902e7..4e9c34cfc7 100644 --- a/src/calibre/ebooks/docx/names.py +++ b/src/calibre/ebooks/docx/names.py @@ -20,6 +20,7 @@ STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering' FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable' IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image' +LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink' namespaces = { 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index 64d1d3f795..d541f7a541 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' import sys, os, re -from collections import OrderedDict +from collections import OrderedDict, defaultdict from lxml import html from lxml.html.builder import ( @@ -16,7 +16,7 @@ from lxml.html.builder import ( from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.names import ( XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor, - descendants) + descendants, ancestor) from calibre.ebooks.docx.styles import Styles, inherit, PageProperties from calibre.ebooks.docx.numbering import Numbering from calibre.ebooks.docx.fonts import Fonts @@ -73,12 +73,15 @@ class Convert(object): self.framed = [[]] self.framed_map = {} self.anchor_map = {} + self.link_map = defaultdict(list) self.read_page_properties(doc) for wp, page_properties in self.page_map.iteritems(): self.current_page = page_properties p = self.convert_p(wp) self.body.append(p) + + self.resolve_links(relationships_by_id) # TODO: tables child of (nested tables?) self.styles.cascade(self.layers) @@ -198,19 +201,28 @@ class Convert(object): self.add_frame(dest, style.frame) current_anchor = None + current_hyperlink = None - for x in descendants(p, 'w:r', 'w:bookmarkStart'): + for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'): if x.tag.endswith('}r'): span = self.convert_run(x) if current_anchor is not None: (dest if len(dest) == 0 else span).set('id', current_anchor) current_anchor = None + if current_hyperlink is not None: + hl = ancestor(x, 'w:hyperlink') + if hl is not None: + self.link_map[hl].append(span) + else: + current_hyperlink = None dest.append(span) self.layers[p].append(x) elif x.tag.endswith('}bookmarkStart'): anchor = get(x, 'w:name') if anchor and anchor not in self.anchor_map: self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues())) + elif x.tag.endswith('}hyperlink'): + current_hyperlink = x m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE) if m is not None: @@ -255,6 +267,31 @@ class Convert(object): for elem in elems: p.remove(elem) wrapper.append(elem) + return wrapper + + def resolve_links(self, relationships_by_id): + for hyperlink, spans in self.link_map.iteritems(): + span = spans[0] + if len(spans) > 1: + span = self.wrap_elems(spans, SPAN()) + span.tag = 'a' + tgt = get(hyperlink, 'w:tgtFrame') + if tgt: + span.set('target', tgt) + tt = get(hyperlink, 'w:tooltip') + if tt: + span.set('title', tt) + rid = get(hyperlink, 'r:id') + if rid and rid in relationships_by_id: + span.set('href', relationships_by_id[rid]) + continue + anchor = get(hyperlink, 'w:anchor') + if anchor and anchor in self.anchor_map: + span.set('href', '#' + self.anchor_map[anchor]) + continue + self.log.warn('Hyperlink with unknown target (%s, %s), ignoring' % + (rid, anchor)) + span.set('href', '#') def convert_run(self, run): ans = SPAN()