mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
DOCX: Handle hyperlinks created as fields
See https://bugs.launchpad.net/calibre/+bug/1196728 for an example.
This commit is contained in:
parent
e8839bc8dc
commit
3b4094a890
106
src/calibre/ebooks/docx/fields.py
Normal file
106
src/calibre/ebooks/docx/fields.py
Normal file
@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.docx.names import XPath, get
|
||||
|
||||
class Field(object):
    '''
    A single complex field found in the document.

    ``start`` is the element that opened the field and ``end`` the element
    that closed it (None until the owner of this object sets it).
    ``contents`` is a list that the owner fills with the elements found
    inside the field. ``instructions`` is a list of ``(name, arguments)``
    tuples, one per instruction text element passed to :meth:`add_instr`.
    '''

    def __init__(self, start):
        self.start = start
        self.end = None
        self.contents = []
        self.instructions = []

    def add_instr(self, elem):
        '''
        Record the field instruction stored in ``elem.text``.

        The text is stripped and split at the first space: the part before
        the space is the instruction name, the (stripped) remainder its
        arguments. Elements whose text is missing, empty or consists only of
        whitespace carry no instruction and are ignored.
        '''
        raw = elem.text
        if not raw:
            return
        raw = raw.strip()
        if not raw:
            # Whitespace-only instruction text. Recording it would add an
            # empty ('', '') instruction, which changes the instruction
            # count that consumers use to recognize the field type.
            return
        name, _, rest = raw.partition(' ')
        self.instructions.append((name, rest.strip()))
|
||||
|
||||
# The two kinds of token produced by ``scanner``
WORD, FLAG = 0, 1

# Tokenizer for the argument string of a field instruction. Whitespace
# between tokens is discarded, quoted words lose their surrounding quotes.
scanner = re.Scanner([
    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
    (r'\s+', None),
], flags=re.DOTALL)

# Maps the letter of a HYPERLINK flag to the key it is reported under
_HYPERLINK_FLAGS = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}


def parse_hyperlink(raw, log):
    '''
    Parse the arguments of a HYPERLINK field instruction.

    :param raw: The instruction text that follows the HYPERLINK keyword.
    :param log: A callable accepting a single message string, used to report
                instructions that cannot be parsed.
    :return: A dict. It is empty if ``raw`` contains no tokens or if its
             first token is not a word (in which case a message is also
             logged). Otherwise it always has the key ``url`` (the first
             word) plus one key per recognized flag: ``anchor``,
             ``image-map``, ``target`` or ``title``. The value of a flag is
             the word that immediately follows it, or None if no word
             follows it. Unrecognized flags, and words that do not
             immediately follow a recognized flag, are ignored.
    '''
    ans = {}
    last_option = None
    for token, token_type in scanner.scan(raw)[0]:
        if not ans:
            # Nothing parsed yet, so this is the first token, which must be
            # the URL. Callers rely on a non-empty result having a 'url' key.
            if token_type != WORD:
                log('Invalid hyperlink, first token is not a URL (%s)' % raw)
                return ans
            ans['url'] = token
            continue
        if token_type == FLAG:
            # token is a backslash followed by a single character
            last_option = _HYPERLINK_FLAGS.get(token[1])
            if last_option is not None:
                ans[last_option] = None
        elif token_type == WORD and last_option is not None:
            ans[last_option] = token
            # A flag takes at most one argument, do not let any stray words
            # that follow overwrite it.
            last_option = None
    return ans
|
||||
|
||||
|
||||
class Fields(object):
    '''
    Finds the complex fields in a document and works out which of them are
    hyperlinks.

    Calling an instance with the document populates two attributes:

    ``fields``
        One :class:`Field` for every ``w:fldChar`` element of type "begin"
        that was encountered.

    ``hyperlink_fields``
        A list of ``(hyperlink, runs)`` tuples, where ``hyperlink`` is the
        dict returned by :func:`parse_hyperlink` and ``runs`` is a list of
        the ``w:r`` elements inside the field, with a new list started at
        every ``w:p`` element inside the field. A field whose contents
        contain several ``w:p`` elements therefore yields several tuples
        that share the same ``hyperlink`` dict.
    '''

    def __init__(self):
        self.fields = []
        # Also created here, not only in __call__(), so that the attribute
        # can be read safely on an instance that has not been called.
        self.hyperlink_fields = []

    def __call__(self, doc, log):
        '''
        Scan ``doc`` for fields and then parse the hyperlink fields.

        :param doc: The document tree to search.
        :param log: A callable accepting a single message string, passed on
                    to :func:`parse_hyperlink`.
        '''
        self._collect_fields(doc)
        self._parse_hyperlink_fields(log)

    def _collect_fields(self, doc):
        ' Create a Field for every begin/end pair of w:fldChar elements '
        # Fields can be nested, the innermost open field is stack[-1].
        # NOTE: this relies on the XPath result being in document order.
        stack = []
        for elem in XPath(
            '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
            if elem.tag.endswith('}fldChar'):
                typ = get(elem, 'w:fldCharType')
                if typ == 'begin':
                    stack.append(Field(elem))
                    self.fields.append(stack[-1])
                elif stack:
                    stack.pop().end = elem
                # else: an end marker without a matching begin, ignore it
            elif elem.tag.endswith('}instrText'):
                if stack:
                    stack[-1].add_instr(elem)
            else:
                # A w:p or w:r element, it belongs to the innermost open
                # field, if any
                if stack:
                    stack[-1].contents.append(elem)

    @staticmethod
    def _runs_by_paragraph(contents):
        ' Group the w:r elements in contents, starting a new group at every w:p '
        all_runs = []
        current_runs = []
        for x in contents:
            if x.tag.endswith('}p'):
                if current_runs:
                    all_runs.append(current_runs)
                current_runs = []
            elif x.tag.endswith('}r'):
                current_runs.append(x)
        if current_runs:
            all_runs.append(current_runs)
        return all_runs

    def _parse_hyperlink_fields(self, log):
        ' Rebuild hyperlink_fields from the fields collected so far '
        self.hyperlink_fields = []
        for field in self.fields:
            if len(field.instructions) != 1 or field.instructions[0][0] != 'HYPERLINK':
                continue
            hl = parse_hyperlink(field.instructions[0][1], log)
            if not hl:
                continue
            if 'target' in hl and hl['target'] is None:
                # A target flag without an argument
                hl['target'] = '_blank'
            # We only handle spans in a single paragraph
            # being wrapped in <a>
            for runs in self._runs_by_paragraph(field.contents):
                self.hyperlink_fields.append((hl, runs))
|
||||
|
||||
|
@ -26,6 +26,7 @@ from calibre.ebooks.docx.footnotes import Footnotes
|
||||
from calibre.ebooks.docx.cleanup import cleanup_markup
|
||||
from calibre.ebooks.docx.theme import Theme
|
||||
from calibre.ebooks.docx.toc import create_toc
|
||||
from calibre.ebooks.docx.fields import Fields
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
|
||||
@ -52,6 +53,7 @@ class Convert(object):
|
||||
self.body = BODY()
|
||||
self.theme = Theme()
|
||||
self.tables = Tables()
|
||||
self.fields = Fields()
|
||||
self.styles = Styles(self.tables)
|
||||
self.images = Images()
|
||||
self.object_map = OrderedDict()
|
||||
@ -79,6 +81,7 @@ class Convert(object):
|
||||
def __call__(self):
|
||||
doc = self.docx.document
|
||||
relationships_by_id, relationships_by_type = self.docx.document_relationships
|
||||
self.fields(doc, self.log)
|
||||
self.read_styles(relationships_by_type)
|
||||
self.images(relationships_by_id)
|
||||
self.layers = OrderedDict()
|
||||
@ -396,6 +399,25 @@ class Convert(object):
|
||||
# hrefs that point nowhere give epubcheck a hernia. The element
|
||||
# should be styled explicitly by Word anyway.
|
||||
# span.set('href', '#')
|
||||
rmap = {v:k for k, v in self.object_map.iteritems()}
|
||||
for hyperlink, runs in self.fields.hyperlink_fields:
|
||||
spans = [rmap[r] for r in runs if r in rmap]
|
||||
if not spans:
|
||||
continue
|
||||
if len(spans) > 1:
|
||||
span = self.wrap_elems(spans, SPAN())
|
||||
span.tag = 'a'
|
||||
tgt = hyperlink.get('target', None)
|
||||
if tgt:
|
||||
span.set('target', tgt)
|
||||
tt = hyperlink.get('title', None)
|
||||
if tt:
|
||||
span.set('title', tt)
|
||||
url = hyperlink['url']
|
||||
if url in self.anchor_map:
|
||||
span.set('href', '#' + self.anchor_map[url])
|
||||
continue
|
||||
span.set('href', url)
|
||||
|
||||
def convert_run(self, run):
|
||||
ans = SPAN()
|
||||
|
Loading…
x
Reference in New Issue
Block a user