DOCX: Hyperlinks

This commit is contained in:
Kovid Goyal 2013-05-22 18:36:29 +05:30
parent fd67eccb88
commit e993b1e0d4
3 changed files with 44 additions and 4 deletions

View File

@ -181,7 +181,9 @@ class DOCX(object):
else: else:
root = fromstring(raw) root = fromstring(raw)
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
target = '/'.join((base, item.get('Target').lstrip('/'))) target = item.get('Target')
if item.get('TargetMode', None) != 'External':
target = '/'.join((base, target.lstrip('/')))
typ = item.get('Type') typ = item.get('Type')
Id = item.get('Id') Id = item.get('Id')
by_id[Id] = by_type[typ] = target by_id[Id] = by_type[typ] = target

View File

@ -20,6 +20,7 @@ STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering' NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable' FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image' IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
namespaces = { namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re import sys, os, re
from collections import OrderedDict from collections import OrderedDict, defaultdict
from lxml import html from lxml import html
from lxml.html.builder import ( from lxml.html.builder import (
@ -16,7 +16,7 @@ from lxml.html.builder import (
from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import ( from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor, XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants) descendants, ancestor)
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts from calibre.ebooks.docx.fonts import Fonts
@ -73,12 +73,15 @@ class Convert(object):
self.framed = [[]] self.framed = [[]]
self.framed_map = {} self.framed_map = {}
self.anchor_map = {} self.anchor_map = {}
self.link_map = defaultdict(list)
self.read_page_properties(doc) self.read_page_properties(doc)
for wp, page_properties in self.page_map.iteritems(): for wp, page_properties in self.page_map.iteritems():
self.current_page = page_properties self.current_page = page_properties
p = self.convert_p(wp) p = self.convert_p(wp)
self.body.append(p) self.body.append(p)
self.resolve_links(relationships_by_id)
# TODO: tables <w:tbl> child of <w:body> (nested tables?) # TODO: tables <w:tbl> child of <w:body> (nested tables?)
self.styles.cascade(self.layers) self.styles.cascade(self.layers)
@ -198,19 +201,28 @@ class Convert(object):
self.add_frame(dest, style.frame) self.add_frame(dest, style.frame)
current_anchor = None current_anchor = None
current_hyperlink = None
for x in descendants(p, 'w:r', 'w:bookmarkStart'): for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
if x.tag.endswith('}r'): if x.tag.endswith('}r'):
span = self.convert_run(x) span = self.convert_run(x)
if current_anchor is not None: if current_anchor is not None:
(dest if len(dest) == 0 else span).set('id', current_anchor) (dest if len(dest) == 0 else span).set('id', current_anchor)
current_anchor = None current_anchor = None
if current_hyperlink is not None:
hl = ancestor(x, 'w:hyperlink')
if hl is not None:
self.link_map[hl].append(span)
else:
current_hyperlink = None
dest.append(span) dest.append(span)
self.layers[p].append(x) self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'): elif x.tag.endswith('}bookmarkStart'):
anchor = get(x, 'w:name') anchor = get(x, 'w:name')
if anchor and anchor not in self.anchor_map: if anchor and anchor not in self.anchor_map:
self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues())) self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
elif x.tag.endswith('}hyperlink'):
current_hyperlink = x
m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE) m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
if m is not None: if m is not None:
@ -255,6 +267,31 @@ class Convert(object):
for elem in elems: for elem in elems:
p.remove(elem) p.remove(elem)
wrapper.append(elem) wrapper.append(elem)
return wrapper
def resolve_links(self, relationships_by_id):
    """Convert the spans gathered for every hyperlink into an <a> element.

    For each entry in ``self.link_map`` the collected spans are reduced to
    a single element (wrapped in a new SPAN when there is more than one),
    which is then retagged as ``a`` and given ``target``, ``title`` and
    ``href`` attributes read from the hyperlink element.

    :param relationships_by_id: mapping consulted with the hyperlink's
        ``r:id`` value; the mapped value is used verbatim as the href.
    """
    # (attribute on the hyperlink element, attribute on the generated <a>)
    optional_attrs = (('w:tgtFrame', 'target'), ('w:tooltip', 'title'))

    for link, link_spans in self.link_map.iteritems():
        if len(link_spans) > 1:
            # Several runs belong to this link: group them under one node
            node = self.wrap_elems(link_spans, SPAN())
        else:
            node = link_spans[0]
        node.tag = 'a'

        for source_attr, html_attr in optional_attrs:
            value = get(link, source_attr)
            if value:
                node.set(html_attr, value)

        rid = get(link, 'r:id')
        if rid and rid in relationships_by_id:
            # A known relationship id takes precedence over any anchor
            href = relationships_by_id[rid]
        else:
            anchor = get(link, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                # Internal link to a bookmark registered in anchor_map
                href = '#' + self.anchor_map[anchor]
            else:
                self.log.warn('Hyperlink with unknown target (%s, %s), ignoring' %
                    (rid, anchor))
                # Fall back to an empty fragment so the <a> still has an href
                href = '#'
        node.set('href', href)
def convert_run(self, run): def convert_run(self, run):
ans = SPAN() ans = SPAN()