DOCX Input: URL-less hyperlink fields and stacked block-level bookmarks

DOCX Input: Add support for hyperlink fields that have only an anchor and
no URL.

DOCX Input: Fix handling of multiple block-level bookmarks at the same
location.

Fixes #1241451 [links are not working](https://bugs.launchpad.net/calibre/+bug/1241451)
This commit is contained in:
Kovid Goyal 2013-10-22 15:46:31 +05:30
parent 93c872e8f3
commit e260906211
2 changed files with 40 additions and 16 deletions

View File

@ -40,18 +40,16 @@ def parse_hyperlink(raw, log):
raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
for token, token_type in scanner.scan(raw)[0]:
token = token.replace('\x01', '\\').replace('\x02', '"')
if not ans:
if token_type is not WORD:
log('Invalid hyperlink, first token is not a URL (%s)' % raw)
return ans
ans['url'] = token
if token_type is FLAG:
last_option = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}.get(token[1], None)
if last_option is not None:
ans[last_option] = None
elif token_type is WORD:
if last_option is not None:
if last_option is None:
ans['url'] = token
else:
ans[last_option] = token
last_option = None
return ans
@ -105,4 +103,20 @@ class Fields(object):
for runs in all_runs:
self.hyperlink_fields.append((hl, runs))
def test_parse_hyperlink():
    """Run a small unittest suite against parse_hyperlink().

    Each case passes a raw hyperlink field string to parse_hyperlink()
    with ``None`` as the log argument and compares the returned dict
    with the expected mapping. The suite is executed immediately with a
    verbose TextTestRunner.
    """
    import unittest

    class TestParseHyperLink(unittest.TestCase):

        def test_parsing(self):
            # (raw field text, expected parse result), checked in order.
            cases = (
                (r'\l anchor1',
                 {'anchor':'anchor1'}),
                (r'www.calibre-ebook.com',
                 {'url':'www.calibre-ebook.com'}),
                (r'www.calibre-ebook.com \t target \o tt',
                 {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}),
                (r'"c:\\Some Folder"',
                 {'url': 'c:\\Some Folder'}),
            )
            for raw, expected in cases:
                self.assertEqual(parse_hyperlink(raw, None), expected)

    loader = unittest.TestLoader()
    suite = loader.loadTestsFromTestCase(TestParseHyperLink)
    runner = unittest.TextTestRunner(verbosity=4)
    runner.run(suite)

View File

@ -346,18 +346,21 @@ class Convert(object):
def read_block_anchors(self, doc):
    """Record which paragraph id each block-level bookmark resolves to.

    Block-level bookmarks are the ``w:bookmarkStart`` elements carrying a
    ``w:name`` that are direct children of ``w:body``. The ``w:p`` and
    bookmark elements yielded by descendants() are walked (presumably in
    document order -- confirm against descendants()). Bookmark names are
    collected until a ``w:p`` that has an entry in the inverse of
    ``self.object_map`` is reached; every collected name is then stored in
    ``self.anchor_map`` against that mapped object's ``id``, so several
    bookmarks stacked at the same location all point at the same
    paragraph.

    :param doc: the parsed document element tree to scan.
    """
    doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not doc_anchors:
        # No block-level bookmarks: nothing to map.
        return

    # Names seen since the last mapped paragraph. A list (not a set) keeps
    # first-seen order so the name used to build the id is deterministic.
    pending = []
    # Inverse of object_map: source element -> generated object.
    rmap = {v:k for k, v in self.object_map.iteritems()}
    for elem in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if elem.tag.endswith('}p'):
            if pending and elem in rmap:
                para = rmap[elem]
                if 'id' not in para.attrib:
                    # Derive the id from the first pending bookmark name,
                    # avoiding ids already present in anchor_map.
                    para.set('id', generate_anchor(pending[0], frozenset(self.anchor_map.itervalues())))
                target = para.get('id')
                for name in pending:
                    self.anchor_map[name] = target
                pending = []
        elif elem in doc_anchors:
            anchor = get(elem, 'w:name')
            # Skip empty names and duplicates of an already pending name.
            if anchor and anchor not in pending:
                pending.append(anchor)
def convert_p(self, p):
dest = P()
@ -500,11 +503,18 @@ class Convert(object):
tt = hyperlink.get('title', None)
if tt:
span.set('title', tt)
url = hyperlink['url']
if url in self.anchor_map:
span.set('href', '#' + self.anchor_map[url])
continue
span.set('href', url)
url = hyperlink.get('url', None)
if url is None:
anchor = hyperlink.get('anchor', None)
if anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor])
continue
self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
else:
if url in self.anchor_map:
span.set('href', '#' + self.anchor_map[url])
continue
span.set('href', url)
for img, link in self.images.links:
parent = img.getparent()