#!/usr/bin/env python
# vim:fileencoding=utf-8

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '

import re


class Field(object):
    """A field delimited by a pair of ``w:fldChar`` begin/end elements.

    Attributes are filled in by :class:`Fields` while it walks the document.
    """

    def __init__(self, start):
        # The w:fldChar element that opened this field
        self.start = start
        # The matching closing w:fldChar element, None until it is seen
        self.end = None
        # The w:p / w:r elements encountered while this field was innermost
        self.contents = []
        # List of (name, arguments) tuples, one per non-empty w:instrText
        self.instructions = []

    def add_instr(self, elem):
        """Record the instruction text carried by *elem*.

        The text is split at the first space into the field name and the
        remaining (stripped) argument string. Elements whose text is missing,
        empty or only whitespace are ignored, so that they do not produce a
        meaningless ``('', '')`` instruction.
        """
        raw = (elem.text or '').strip()
        if not raw:
            return
        name, _, rest = raw.partition(' ')
        self.instructions.append((name, rest.strip()))


WORD, FLAG = 0, 1

# Tokenizer for field arguments. Produces (text, token_type) tuples and
# discards whitespace. Quoted words are returned without their quotes.
scanner = re.Scanner([
    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
    # A non-quoted word, must not start with a backslash or a space or a quote
    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),
    (r'\s+', None),
], flags=re.DOTALL)

# Maps the letter of a HYPERLINK flag to the key used in the parsed result
_HYPERLINK_SWITCHES = {
    'l': 'anchor',
    'm': 'image-map',
    'n': 'target',
    'o': 'title',
    't': 'target',
}


def parse_hyperlink(raw, log):
    """Parse the argument string of a HYPERLINK field.

    :param raw: The text following the ``HYPERLINK`` field name.
    :param log: A callable taking a single message string. It is called when
        the first token is not a word (that is, not usable as a URL).
    :return: A dict. It is empty when *raw* has no tokens or does not start
        with a URL. Otherwise it always has a ``'url'`` key, plus one key per
        recognised flag (see ``_HYPERLINK_SWITCHES``). A flag's value is the
        word that follows it, or None when no word follows.
    """
    ans = {}
    last_option = None
    for token, token_type in scanner.scan(raw)[0]:
        if not ans:
            # The very first token has to be the URL
            if token_type != WORD:
                log('Invalid hyperlink, first token is not a URL (%s)' % raw)
                return ans
            ans['url'] = token
            continue
        if token_type == FLAG:
            last_option = _HYPERLINK_SWITCHES.get(token[1])
            if last_option is not None:
                ans[last_option] = None
        elif last_option is not None:
            ans[last_option] = token
            # A flag takes a single argument. Forget it, so that any stray
            # words that follow do not overwrite the value just stored.
            last_option = None
    return ans


def _group_runs_by_paragraph(elems):
    """Split *elems* into lists of runs, starting a new list at every
    paragraph element. Empty groups are dropped."""
    groups, current = [], []
    for x in elems:
        if x.tag.endswith('}p'):
            if current:
                groups.append(current)
                current = []
        elif x.tag.endswith('}r'):
            current.append(x)
    if current:
        groups.append(current)
    return groups


class Fields(object):
    """Collects the fields in a document and parses the HYPERLINK ones.

    After an instance has been called, ``fields`` holds every :class:`Field`
    that was opened and ``hyperlink_fields`` holds ``(hyperlink, runs)``
    tuples, where ``hyperlink`` is the dict returned by
    :func:`parse_hyperlink` and ``runs`` is a list of run elements that all
    belong to the same paragraph.
    """

    def __init__(self):
        self.fields = []
        # Defined here as well so that it can be read before __call__ runs
        self.hyperlink_fields = []

    def __call__(self, doc, log):
        """Scan *doc* for fields, reporting problems via the callable *log*."""
        # Imported here, in its only user, so that the parsing helpers in
        # this module can be imported on their own.
        from calibre.ebooks.docx.names import XPath, get

        stack = []
        for elem in XPath(
            '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
            if elem.tag.endswith('}fldChar'):
                typ = get(elem, 'w:fldCharType')
                if typ == 'begin':
                    field = Field(elem)
                    stack.append(field)
                    self.fields.append(field)
                elif stack:
                    stack.pop().end = elem
                # else: an end marker without a matching begin, ignore it
            elif stack:
                # Everything else is attributed to the innermost open field
                if elem.tag.endswith('}instrText'):
                    stack[-1].add_instr(elem)
                else:
                    stack[-1].contents.append(elem)

        # Parse hyperlink fields
        self.hyperlink_fields = []
        for field in self.fields:
            if len(field.instructions) != 1 or field.instructions[0][0] != 'HYPERLINK':
                continue
            hl = parse_hyperlink(field.instructions[0][1], log)
            if not hl:
                continue
            if 'target' in hl and hl['target'] is None:
                # A target flag that was given without a value
                hl['target'] = '_blank'
            # We only handle spans in a single paragraph being wrapped in
            # <a>, so a field spanning several paragraphs gives one entry
            # per paragraph
            for runs in _group_runs_by_paragraph(field.contents):
                self.hyperlink_fields.append((hl, runs))
self.log) self.read_styles(relationships_by_type) self.images(relationships_by_id) self.layers = OrderedDict() @@ -396,6 +399,25 @@ class Convert(object): # hrefs that point nowhere give epubcheck a hernia. The element # should be styled explicitly by Word anyway. # span.set('href', '#') + rmap = {v:k for k, v in self.object_map.iteritems()} + for hyperlink, runs in self.fields.hyperlink_fields: + spans = [rmap[r] for r in runs if r in rmap] + if not spans: + continue + if len(spans) > 1: + span = self.wrap_elems(spans, SPAN()) + span.tag = 'a' + tgt = hyperlink.get('target', None) + if tgt: + span.set('target', tgt) + tt = hyperlink.get('title', None) + if tt: + span.set('title', tt) + url = hyperlink['url'] + if url in self.anchor_map: + span.set('href', '#' + self.anchor_map[url]) + continue + span.set('href', url) def convert_run(self, run): ans = SPAN()