DOCX Output: Implement hyperlinks

This commit is contained in:
Kovid Goyal 2015-04-29 15:54:04 +05:30
parent 7c65e3cd09
commit 2160cbaecf
6 changed files with 164 additions and 37 deletions

View File

@ -1,5 +1,4 @@
Table of Contents Table of Contents
Links
<hr> tag (probably as an empty block with a border) <hr> tag (probably as an empty block with a border)
Various TODOs sprinkled through the source Various TODOs sprinkled through the source
List image bullet List image bullet

View File

@ -10,6 +10,7 @@ import re
from calibre.ebooks.docx.writer.container import create_skeleton from calibre.ebooks.docx.writer.container import create_skeleton
from calibre.ebooks.docx.writer.styles import StylesManager, FloatSpec from calibre.ebooks.docx.writer.styles import StylesManager, FloatSpec
from calibre.ebooks.docx.writer.links import LinksManager
from calibre.ebooks.docx.writer.images import ImagesManager from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.docx.writer.fonts import FontsManager from calibre.ebooks.docx.writer.fonts import FontsManager
from calibre.ebooks.docx.writer.tables import Table from calibre.ebooks.docx.writer.tables import Table
@ -51,30 +52,36 @@ class TextRun(object):
TextRun.ws_pat = self.ws_pat = re.compile(r'\s+') TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
self.style = style self.style = style
self.texts = [] self.texts = []
self.link = None
self.makelement = namespace.makeelement self.makelement = namespace.makeelement
def add_text(self, text, preserve_whitespace): def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
if not preserve_whitespace: if not preserve_whitespace:
text = self.ws_pat.sub(' ', text) text = self.ws_pat.sub(' ', text)
if text.strip() != text: if text.strip() != text:
# If preserve_whitespace is False, Word ignores leading and # If preserve_whitespace is False, Word ignores leading and
# trailing whitespace # trailing whitespace
preserve_whitespace = True preserve_whitespace = True
self.texts.append((text, preserve_whitespace)) self.texts.append((text, preserve_whitespace, bookmark))
self.link = link
def add_break(self, clear='none'): def add_break(self, clear='none', bookmark=None):
self.texts.append((None, clear)) self.texts.append((None, clear, bookmark))
def add_image(self, drawing): def add_image(self, drawing, bookmark=None):
self.texts.append((drawing, None)) self.texts.append((drawing, None, bookmark))
def serialize(self, p): def serialize(self, p, links_manager):
makeelement = self.makelement makeelement = self.makelement
r = makeelement(p, 'w:r') parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link)
r = makeelement(parent, 'w:r')
rpr = makeelement(r, 'w:rPr') rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:rStyle', w_val=self.style.id) makeelement(rpr, 'w:rStyle', w_val=self.style.id)
for text, preserve_whitespace in self.texts: for text, preserve_whitespace, bookmark in self.texts:
if bookmark is not None:
bid = links_manager.bookmark_id
makeelement(r, 'w:bookmarkStart', w_id=str(bid), w_name=bookmark)
if text is None: if text is None:
makeelement(r, 'w:br', w_clear=preserve_whitespace) makeelement(r, 'w:br', w_clear=preserve_whitespace)
elif hasattr(text, 'xpath'): elif hasattr(text, 'xpath'):
@ -84,6 +91,8 @@ class TextRun(object):
t.text = text or '' t.text = text or ''
if preserve_whitespace: if preserve_whitespace:
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve') t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
if bookmark is not None:
makeelement(r, 'w:bookmarkEnd', w_id=str(bid))
def __repr__(self): def __repr__(self):
return repr(self.texts) return repr(self.texts)
@ -91,14 +100,15 @@ class TextRun(object):
def is_empty(self): def is_empty(self):
if not self.texts: if not self.texts:
return True return True
if len(self.texts) == 1 and self.texts[0] == ('', False): if len(self.texts) == 1 and self.texts[0][:2] == ('', False):
return True return True
return False return False
class Block(object): class Block(object):
def __init__(self, namespace, styles_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False): def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
self.namespace = namespace self.namespace = namespace
self.bookmarks = set()
self.list_tag = (html_block, style) if is_list_item else None self.list_tag = (html_block, style) if is_list_item else None
self.numbering_id = None self.numbering_id = None
self.parent_items = None self.parent_items = None
@ -108,7 +118,7 @@ class Block(object):
float_spec.blocks.append(self) float_spec.blocks.append(self)
self.html_style = style self.html_style = style
self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell) self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell)
self.styles_manager = styles_manager self.styles_manager, self.links_manager = styles_manager, links_manager
self.keep_next = False self.keep_next = False
self.page_break_before = False self.page_break_before = False
self.runs = [] self.runs = []
@ -122,10 +132,10 @@ class Block(object):
if self.list_tag is not None: if self.list_tag is not None:
next_block.list_tag = self.list_tag next_block.list_tag = self.list_tag
def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False): def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False, bookmark=None, link=None):
ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style) ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style)
ws = style['white-space'] ws = style['white-space']
if self.runs and ts == self.runs[-1].style: if self.runs and ts == self.runs[-1].style and link == self.runs[-1].link:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent) run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent)
@ -135,30 +145,35 @@ class Block(object):
text = text.lstrip() text = text.lstrip()
if ws == 'pre-line': if ws == 'pre-line':
for text in text.splitlines(): for text in text.splitlines():
run.add_text(text, False) run.add_text(text, False, bookmark=bookmark, link=link)
bookmark = None
run.add_break() run.add_break()
else: else:
run.add_text(text, preserve_whitespace) run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link)
def add_break(self, clear='none'): def add_break(self, clear='none', bookmark=None):
if self.runs: if self.runs:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block) run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run) self.runs.append(run)
run.add_break(clear=clear) run.add_break(clear=clear, bookmark=bookmark)
def add_image(self, drawing): def add_image(self, drawing, bookmark=None):
if self.runs: if self.runs:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block) run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run) self.runs.append(run)
run.add_image(drawing) run.add_image(drawing, bookmark=bookmark)
def serialize(self, body): def serialize(self, body):
makeelement = self.namespace.makeelement makeelement = self.namespace.makeelement
p = makeelement(body, 'w:p') p = makeelement(body, 'w:p')
end_bookmarks = []
for bmark in self.bookmarks:
end_bookmarks.append(str(self.links_manager.bookmark_id))
makeelement(p, 'w:bookmarkStart', w_id=end_bookmarks[-1], w_name=bmark)
ppr = makeelement(p, 'w:pPr') ppr = makeelement(p, 'w:pPr')
if self.keep_next: if self.keep_next:
makeelement(ppr, 'w:keepNext') makeelement(ppr, 'w:keepNext')
@ -172,7 +187,9 @@ class Block(object):
makeelement(numpr, 'w:numId', w_val=str(self.numbering_id[0])) makeelement(numpr, 'w:numId', w_val=str(self.numbering_id[0]))
makeelement(ppr, 'w:pStyle', w_val=self.style.id) makeelement(ppr, 'w:pStyle', w_val=self.style.id)
for run in self.runs: for run in self.runs:
run.serialize(p) run.serialize(p, self.links_manager)
for bmark in end_bookmarks:
makeelement(p, 'w:bookmarkEnd', w_id=bmark)
def __repr__(self): def __repr__(self):
return 'Block(%r)' % self.runs return 'Block(%r)' % self.runs
@ -185,9 +202,10 @@ class Block(object):
class Blocks(object): class Blocks(object):
def __init__(self, namespace, styles_manager): def __init__(self, namespace, styles_manager, links_manager):
self.namespace = namespace self.namespace = namespace
self.styles_manager = styles_manager self.styles_manager = styles_manager
self.links_manager = links_manager
self.all_blocks = [] self.all_blocks = []
self.pos = 0 self.pos = 0
self.current_block = None self.current_block = None
@ -213,7 +231,7 @@ class Blocks(object):
def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False): def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
self.end_current_block() self.end_current_block()
self.current_block = Block( self.current_block = Block(
self.namespace, self.styles_manager, html_block, style, self.namespace, self.styles_manager, self.links_manager, html_block, style,
is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item) is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
self.open_html_blocks.add(html_block) self.open_html_blocks.add(html_block)
return self.current_block return self.current_block
@ -266,6 +284,10 @@ class Blocks(object):
block.parent_items = None block.parent_items = None
if block.float_spec is not None: if block.float_spec is not None:
block.float_spec.blocks.remove(block) block.float_spec.blocks.remove(block)
try:
self.all_blocks[pos].bookmarks.update(block.bookmarks)
except (IndexError, KeyError):
pass
def __enter__(self): def __enter__(self):
self.pos = len(self.all_blocks) self.pos = len(self.all_blocks)
@ -286,20 +308,29 @@ class Blocks(object):
class Convert(object): class Convert(object):
# Word does not apply default styling to hyperlinks, so we ensure they get
# default styling (the conversion pipeline does not apply any styling to
# them).
base_css = '''
a[href] { text-decoration: underline; color: blue }
'''
def __init__(self, oeb, docx): def __init__(self, oeb, docx):
self.oeb, self.docx = oeb, docx self.oeb, self.docx = oeb, docx
self.log, self.opts = docx.log, docx.opts self.log, self.opts = docx.log, docx.opts
def __call__(self): def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
self.svg_rasterizer = SVGRasterizer() self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
self.svg_rasterizer(self.oeb, self.opts) self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager(self.docx.namespace) self.styles_manager = StylesManager(self.docx.namespace)
self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships) self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
self.lists_manager = ListsManager(self.docx) self.lists_manager = ListsManager(self.docx)
self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts) self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
self.blocks = Blocks(self.docx.namespace, self.styles_manager) self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
self.current_link = None
for item in self.oeb.spine: for item in self.oeb.spine:
self.process_item(item) self.process_item(item)
@ -322,13 +353,15 @@ class Convert(object):
self.write() self.write()
def process_item(self, item): def process_item(self, item):
self.current_item = item
stylizer = self.svg_rasterizer.stylizer_cache.get(item) stylizer = self.svg_rasterizer.stylizer_cache.get(item)
if stylizer is None: if stylizer is None:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile, base_css=self.base_css)
self.abshref = self.images_manager.abshref = item.abshref self.abshref = self.images_manager.abshref = item.abshref
for i, body in enumerate(XPath('//h:body')(item.data)): for i, body in enumerate(XPath('//h:body')(item.data)):
with self.blocks: with self.blocks:
body.set('id', body.get('id', None) or self.links_manager.top_anchor)
self.process_tag(body, stylizer, is_first_tag=i == 0) self.process_tag(body, stylizer, is_first_tag=i == 0)
def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None): def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
@ -339,6 +372,10 @@ class Convert(object):
if tag_style.is_hidden: if tag_style.is_hidden:
return return
previous_link = self.current_link
if tagname == 'a' and html_tag.get('href'):
self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
display = tag_style._get('display') display = tag_style._get('display')
is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
if float_spec is None and is_float: if float_spec is None and is_float:
@ -376,6 +413,8 @@ class Convert(object):
if is_block and tag_style['page-break-after'] == 'avoid': if is_block and tag_style['page-break-after'] == 'avoid':
self.blocks.all_blocks[-1].keep_next = True self.blocks.all_blocks[-1].keep_next = True
self.current_link = previous_link
if display == 'table-row': if display == 'table-row':
return # We ignore the tail for these tags return # We ignore the tail for these tags
@ -384,28 +423,38 @@ class Convert(object):
# Ignore trailing space after a block tag, as otherwise it will # Ignore trailing space after a block tag, as otherwise it will
# become a new empty paragraph # become a new empty paragraph
block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent())) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True) block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link)
def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False): def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
block = self.blocks.start_new_block(html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item) block = self.blocks.start_new_block(html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
anchor = html_tag.get('id') or html_tag.get('name')
if anchor:
block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
if tagname == 'img': if tagname == 'img':
self.images_manager.add_image(html_tag, block, stylizer) self.images_manager.add_image(html_tag, block, stylizer)
else: else:
if html_tag.text: if html_tag.text:
block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True) block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link)
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer): def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
anchor = html_tag.get('id') or html_tag.get('name') or None
bmark = None
if anchor:
bmark = self.bookmark_for_anchor(anchor, html_tag)
if tagname == 'br': if tagname == 'br':
if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]: if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent())) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none')) block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
elif tagname == 'img': elif tagname == 'img':
block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent())) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
self.images_manager.add_image(html_tag, block, stylizer) self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
else: else:
if html_tag.text: if html_tag.text:
block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent())) block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
block.add_text(html_tag.text, tag_style, is_parent_style=False) block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link)
def bookmark_for_anchor(self, anchor, html_tag):
return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)
def write(self): def write(self):
self.docx.document, self.docx.styles, body = create_skeleton(self.opts) self.docx.document, self.docx.styles, body = create_skeleton(self.opts)

View File

@ -44,7 +44,7 @@ class ImagesManager(object):
self.document_relationships = document_relationships self.document_relationships = document_relationships
self.count = 0 self.count = 0
def add_image(self, img, block, stylizer): def add_image(self, img, block, stylizer, bookmark=None):
src = img.get('src') src = img.get('src')
if not src: if not src:
return return
@ -59,7 +59,7 @@ class ImagesManager(object):
self.images[href] = Image(image_rid, image_fname, width, height, fmt, item) self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
item.unload_data_from_memory() item.unload_data_from_memory()
drawing = self.create_image_markup(img, stylizer, href) drawing = self.create_image_markup(img, stylizer, href)
block.add_image(drawing) block.add_image(drawing, bookmark=bookmark)
return self.images[href].rid return self.images[href].rid
def create_image_markup(self, html_img, stylizer, href): def create_image_markup(self, html_img, stylizer, href):

View File

@ -0,0 +1,76 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import posixpath
from uuid import uuid4
from urlparse import urlparse
def start_text(tag, prefix_len=0, top_level=True):
    """Return a short excerpt of the text at the start of *tag*.

    Text is gathered from ``tag.text`` and then, in order, from each child
    element (recursively) followed by that child's ``tail``. Gathering stops
    as soon as the excerpt reaches the character budget of ``50 - prefix_len``.

    :param tag: an element exposing ``text``, ``tail`` and ``iterchildren('*')``
    :param prefix_len: number of characters already collected by the caller;
        it shrinks the budget for this call
    :param top_level: when true, an excerpt longer than the budget is cut to
        the budget and suffixed with ``'...'``; recursive calls pass False so
        only the outermost call truncates
    :return: the collected text (possibly empty)
    """
    budget = 50 - prefix_len
    collected = tag.text or ''
    if len(collected) < budget:
        for node in tag.iterchildren('*'):
            # The child only sees how much this level has gathered so far.
            piece = start_text(node, len(collected), top_level=False)
            collected = collected + piece + (node.tail or '')
            if len(collected) >= budget:
                break
    if top_level and len(collected) > budget:
        return collected[:budget] + '...'
    return collected
class LinksManager(object):
    """Allocate bookmark names/ids and emit ``w:hyperlink`` elements.

    State kept on the instance:

    * ``anchor_map`` maps ``(item href, anchor)`` to the bookmark name
      assigned to that anchor.
    * ``used_bookmark_names`` holds every bookmark name handed out so far, so
      that each new name can be made unique.
    * ``document_hrefs`` holds the hrefs of items for which a "top of
      document" bookmark has been requested.
    * ``external_links`` maps an external URL to the value returned by
      ``document_relationships.add_relationship`` for it.
    * ``top_anchor`` is a random hex string used as the anchor that stands
      for the top of an item.
    """

    def __init__(self, namespace, document_relationships):
        self.namespace = namespace
        self.document_relationships = document_relationships
        # Misspelled alias of the attribute above, kept so that any code that
        # still refers to the old name continues to work.
        self.docment_relationships = document_relationships
        self.top_anchor = type('')(uuid4().hex)
        self.anchor_map = {}
        self.used_bookmark_names = set()
        self.bmark_id = 0
        self.document_hrefs = set()
        self.external_links = {}

    def bookmark_for_anchor(self, anchor, current_item, html_tag):
        """Return the bookmark name for *anchor* inside *current_item*.

        The same ``(current_item.href, anchor)`` pair always yields the same
        name. A new name is derived from the item's basename (for the top
        anchor) or from the leading text of *html_tag*, falling back to the
        anchor itself, and is suffixed with `` 1``, `` 2``, ... until it does
        not clash with a name that was already handed out.

        :param anchor: the id/name of the anchor, or ``self.top_anchor``
        :param current_item: object with an ``href`` attribute
        :param html_tag: element passed to ``start_text`` to build a readable
            name; not used for the top anchor
        :return: the bookmark name
        """
        key = (current_item.href, anchor)
        if key in self.anchor_map:
            return self.anchor_map[key]
        if anchor == self.top_anchor:
            name = ('Top of %s' % posixpath.basename(current_item.href))
            self.document_hrefs.add(current_item.href)
        else:
            name = start_text(html_tag).strip() or anchor
        i, bname = 0, name
        while name in self.used_bookmark_names:
            i += 1
            name = bname + (' %d' % i)
        # Record the name; without this the uniqueness loop above can never
        # trigger and distinct anchors could share one bookmark name.
        self.used_bookmark_names.add(name)
        self.anchor_map[key] = name
        return name

    @property
    def bookmark_id(self):
        """A fresh bookmark id.

        NOTE: every read of this property increments the counter, so read it
        once and keep the value when the same id is needed for the matching
        start and end markers.
        """
        self.bmark_id += 1
        return self.bmark_id

    def serialize_hyperlink(self, parent, link):
        """Create the element that runs belonging to *link* should live in.

        :param parent: the element the hyperlink would be created under
        :param link: a ``(item, url, tooltip)`` tuple; ``item`` must provide
            ``abshref()``
        :return: a new ``w:hyperlink`` child of *parent* for links into a
            known document (by bookmark) or for http/https/ftp URLs (by
            relationship id); otherwise *parent* itself, i.e. no hyperlink
        """
        item, url, tooltip = link
        purl = urlparse(url)
        href = purl.path
        if not purl.scheme:
            href = item.abshref(href)
            if href in self.document_hrefs:
                key = (href, purl.fragment or self.top_anchor)
                if key in self.anchor_map:
                    bmark = self.anchor_map[key]
                else:
                    # Unknown fragment: point at the top of the target item.
                    bmark = self.anchor_map[(href, self.top_anchor)]
                return self.namespace.makeelement(parent, 'w:hyperlink', w_anchor=bmark, w_tooltip=tooltip or '')
        if purl.scheme in {'http', 'https', 'ftp'}:
            if url not in self.external_links:
                # One relationship per distinct URL, reused on later links.
                self.external_links[url] = self.document_relationships.add_relationship(
                    url, self.namespace.names['LINKS'], target_mode='External')
            return self.namespace.makeelement(parent, 'w:hyperlink', r_id=self.external_links[url], w_tooltip=tooltip or '')
        return parent

View File

@ -53,7 +53,7 @@ class Stylizer(object):
STYLESHEETS = WeakKeyDictionary() STYLESHEETS = WeakKeyDictionary()
def __init__(self, tree, path, oeb, opts, profile=None, def __init__(self, tree, path, oeb, opts, profile=None,
extra_css='', user_css=''): extra_css='', user_css='', base_css=''):
self.oeb, self.opts = oeb, opts self.oeb, self.opts = oeb, opts
self.profile = profile self.profile = profile
if self.profile is None: if self.profile is None:
@ -74,6 +74,8 @@ class Stylizer(object):
basename = os.path.basename(path) basename = os.path.basename(path)
cssname = os.path.splitext(basename)[0] + '.css' cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [html_css_stylesheet()] stylesheets = [html_css_stylesheet()]
if base_css:
stylesheets.append(parseString(base_css, validate=False))
style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')
# Add cssutils parsing profiles from output_profile # Add cssutils parsing profiles from output_profile

View File

@ -27,7 +27,8 @@ class Unavailable(Exception):
pass pass
class SVGRasterizer(object): class SVGRasterizer(object):
def __init__(self): def __init__(self, base_css=''):
self.base_css = base_css
from calibre.gui2 import must_use_qt from calibre.gui2 import must_use_qt
must_use_qt() must_use_qt()
@ -129,7 +130,7 @@ class SVGRasterizer(object):
ans = self.stylizer_cache.get(item, None) ans = self.stylizer_cache.get(item, None)
if ans is None: if ans is None:
ans = Stylizer(item.data, item.href, self.oeb, self.opts, ans = Stylizer(item.data, item.href, self.oeb, self.opts,
self.profile) self.profile, base_css=self.base_css)
self.stylizer_cache[item] = ans self.stylizer_cache[item] = ans
return ans return ans