diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py
index 68f74a3c82..deaf5bd4d0 100644
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@@ -183,7 +183,7 @@ class DOCX(object):
             root = fromstring(raw)
             for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                 target = item.get('Target')
-                if item.get('TargetMode', None) != 'External':
+                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                     target = '/'.join((base, target.lstrip('/')))
                 typ = item.get('Type')
                 Id = item.get('Id')
diff --git a/src/calibre/ebooks/docx/fields.py b/src/calibre/ebooks/docx/fields.py
new file mode 100644
index 0000000000..9b0d053cd0
--- /dev/null
+++ b/src/calibre/ebooks/docx/fields.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+import re
+
+from calibre.ebooks.docx.names import XPath, get
+
+class Field(object):
+
+    ''' A single complex field: what lies between a begin and an end
+    w:fldChar marker '''
+
+    def __init__(self, start):
+        self.start = start
+        self.end = None
+        self.contents = []
+        self.instructions = []
+
+    def add_instr(self, elem):
+        ''' Record the instruction text in elem as a (name, arguments) pair '''
+        raw = elem.text
+        if not raw:
+            return
+        name, rest = raw.strip().partition(' ')[0::2]
+        self.instructions.append((name, rest.strip()))
+
+WORD, FLAG = 0, 1
+scanner = re.Scanner([
+    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
+    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
+    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
+    (r'\s+', None),
+], flags=re.DOTALL)
+
+# Maps the switches of a HYPERLINK field to the keys used in the parsed result
+HYPERLINK_OPTIONS = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}
+
+
+def parse_hyperlink(raw, log):
+    ''' Parse the arguments of a HYPERLINK field into a dict. The first bare
+    word is stored as 'url', the word following a known switch is stored
+    under the name of that switch (the value is None if no word follows).
+    Returns an empty dict if there is neither a URL nor an anchor. '''
+    ans = {}
+    last_option = None
+    for token, token_type in scanner.scan(raw)[0]:
+        if token_type is FLAG:
+            last_option = HYPERLINK_OPTIONS.get(token[1], None)
+            if last_option is not None:
+                ans[last_option] = None
+        elif token_type is WORD:
+            if last_option is None:
+                ans.setdefault('url', token)
+            else:
+                ans[last_option] = token
+                # A switch takes at most one argument
+                last_option = None
+    if 'url' not in ans and not ans.get('anchor'):
+        log('Invalid hyperlink, no URL or anchor found (%s)' % raw)
+        return {}
+    return ans
+
+
+class Fields(object):
+
+    ''' Collects the complex fields of a document '''
+
+    def __init__(self):
+        self.fields = []
+        self.hyperlink_fields = []
+
+    def __call__(self, doc, log):
+        ''' Find the fields in doc and fill in self.hyperlink_fields '''
+        stack = []
+        for elem in XPath(
+            '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
+            if elem.tag.endswith('}fldChar'):
+                typ = get(elem, 'w:fldCharType')
+                if typ == 'begin':
+                    stack.append(Field(elem))
+                    self.fields.append(stack[-1])
+                else:
+                    try:
+                        stack.pop().end = elem
+                    except IndexError:
+                        # An end marker without a matching begin, ignore it
+                        pass
+            elif elem.tag.endswith('}instrText'):
+                if stack:
+                    stack[-1].add_instr(elem)
+            else:
+                if stack:
+                    stack[-1].contents.append(elem)
+
+        # Parse hyperlink fields
+        self.hyperlink_fields = []
+        for field in self.fields:
+            if len(field.instructions) == 1 and field.instructions[0][0] == 'HYPERLINK':
+                hl = parse_hyperlink(field.instructions[0][1], log)
+                if hl:
+                    if 'target' in hl and hl['target'] is None:
+                        hl['target'] = '_blank'
+                    all_runs = []
+                    current_runs = []
+                    # We only handle spans in a single paragraph
+                    # being wrapped in <a>
+                    for x in field.contents:
+                        if x.tag.endswith('}p'):
+                            if current_runs:
+                                all_runs.append(current_runs)
+                            current_runs = []
+                        elif x.tag.endswith('}r'):
+                            current_runs.append(x)
+                    if current_runs:
+                        all_runs.append(current_runs)
+                    for runs in all_runs:
+                        self.hyperlink_fields.append((hl, runs))
+
+
diff --git a/src/calibre/ebooks/docx/images.py b/src/calibre/ebooks/docx/images.py
index b0a5348d90..3be3d51c05 100644
--- a/src/calibre/ebooks/docx/images.py
+++ b/src/calibre/ebooks/docx/images.py
@@ -96,6 +96,7 @@ class Images(object):
         self.used = {}
         self.names = set()
         self.all_images = set()
+        self.links = []
 
     def __call__(self, relationships_by_id):
         self.rid_map = relationships_by_id
@@ -125,8 +126,20 @@ class Images(object):
         self.all_images.add('images/' + name)
         return name
 
-    def pic_to_img(self, pic, alt=None):
+    def pic_to_img(self, pic, alt, parent):
         name = None
+        # Remember any hyperlink attached to the picture in self.links, the
+        # <img> is only recorded here, not wrapped in a link
+        link = None
+        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
+            link = {'id':get(hl, 'r:id')}
+            tgt = hl.get('tgtFrame', None)
+            if tgt:
+                link['target'] = tgt
+            title = hl.get('tooltip', None)
+            if title:
+                link['title'] = title
+
         for pr in XPath('descendant::pic:cNvPr')(pic):
             name = pr.get('name', None)
             if name:
@@ -138,6 +151,8 @@ class Images(object):
                 src = self.generate_filename(rid, name)
                 img = IMG(src='images/%s' % src)
                 img.set('alt', alt or 'Image')
+                if link is not None:
+                    self.links.append((img, link))
                 return img
 
     def drawing_to_html(self, drawing, page):
@@ -145,7 +160,7 @@ class Images(object):
         for inline in XPath('./wp:inline')(drawing):
             style, alt = get_image_properties(inline)
             for pic in XPath('descendant::pic:pic')(inline):
-                ans = self.pic_to_img(pic, alt)
+                ans = self.pic_to_img(pic, alt, inline)
                 if ans is not None:
                     if style:
                         ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
@@ -156,7 +171,7 @@ class Images(object):
             style, alt = get_image_properties(anchor)
             self.get_float_properties(anchor, style, page)
             for pic in XPath('descendant::pic:pic')(anchor):
-                ans = self.pic_to_img(pic, alt)
+                ans = self.pic_to_img(pic, alt, anchor)
                 if ans is not None:
                     if style:
                         ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py
index 21f45616fa..4572eb59f2 100644
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@@ -403,6 +403,12 @@ class Styles(object):
                     ps.margin_top = 0
             last_para = p
 
+    def apply_section_page_breaks(self, paras):
+        ''' Set pageBreakBefore on the resolved style of every paragraph in paras '''
+        for p in paras:
+            ps = self.resolve_paragraph(p)
+            ps.pageBreakBefore = True
+
     def register(self, css, prefix):
         h = hash(frozenset(css.iteritems()))
         ans, _ = self.classes.get(h, (None, None))
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 79020d9c0a..01808657ea 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -26,6 +26,7 @@ from calibre.ebooks.docx.footnotes import Footnotes
 from calibre.ebooks.docx.cleanup import cleanup_markup
 from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
+from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
 
@@ -52,6 +53,7 @@ class Convert(object):
         self.body = BODY()
         self.theme = Theme()
         self.tables = Tables()
+        self.fields = Fields()
         self.styles = Styles(self.tables)
         self.images = Images()
         self.object_map = OrderedDict()
@@ -79,6 +81,7 @@ class Convert(object):
     def __call__(self):
         doc = self.docx.document
         relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.fields(doc, self.log)
         self.read_styles(relationships_by_type)
         self.images(relationships_by_id)
         self.layers = OrderedDict()
@@ -96,7 +99,11 @@ class Convert(object):
                 p = self.convert_p(wp)
                 self.body.append(p)
                 paras.append(wp)
+        self.read_block_anchors(doc)
         self.styles.apply_contextual_spacing(paras)
+        # Apply page breaks at the start of every section, except the first
+        # section (since that will be the start of the file)
+        self.styles.apply_section_page_breaks(self.section_starts[1:])
 
         notes_header = None
         if self.footnotes.has_notes:
@@ -177,6 +184,7 @@ class Convert(object):
     def read_page_properties(self, doc):
         current = []
         self.page_map = OrderedDict()
+        self.section_starts = []
 
         for p in descendants(doc, 'w:p', 'w:tbl'):
             if p.tag.endswith('}tbl'):
@@ -186,8 +194,10 @@ class Convert(object):
             sect = tuple(descendants(p, 'w:sectPr'))
             if sect:
                 pr = PageProperties(sect)
-                for x in current + [p]:
+                paras = current + [p]
+                for x in paras:
                     self.page_map[x] = pr
+                self.section_starts.append(paras[0])
                 current = []
             else:
                 current.append(p)
@@ -287,6 +297,25 @@ class Convert(object):
             opf.render(of, ncx, 'toc.ncx')
         return os.path.join(self.dest_dir, 'metadata.opf')
 
+    def read_block_anchors(self, doc):
+        # Bookmarks that are direct children of w:body are attached to the
+        # next paragraph that was converted (this assumes descendants()
+        # yields elements in document order)
+        doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+        if doc_anchors:
+            current_bm = None
+            rmap = {v:k for k, v in self.object_map.iteritems()}
+            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+                if p.tag.endswith('}p'):
+                    if current_bm and p in rmap:
+                        para = rmap[p]
+                        if 'id' not in para.attrib:
+                            para.set('id', generate_anchor(current_bm, frozenset(self.anchor_map.itervalues())))
+                        self.anchor_map[current_bm] = para.get('id')
+                        current_bm = None
+                elif p in doc_anchors:
+                    current_bm = get(p, 'w:name')
+
     def convert_p(self, p):
         dest = P()
         self.object_map[dest] = p
@@ -316,7 +345,13 @@ class Convert(object):
             elif x.tag.endswith('}bookmarkStart'):
                 anchor = get(x, 'w:name')
                 if anchor and anchor not in self.anchor_map:
+                    old_anchor = current_anchor
                     self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
+                    if old_anchor is not None:
+                        # The previous anchor was not applied to any element
+                        for a, t in tuple(self.anchor_map.iteritems()):
+                            if t == old_anchor:
+                                self.anchor_map[a] = current_anchor
             elif x.tag.endswith('}hyperlink'):
                 current_hyperlink = x
 
@@ -396,6 +431,54 @@ class Convert(object):
             # hrefs that point nowhere give epubcheck a hernia. The element
             # should be styled explicitly by Word anyway.
             # span.set('href', '#')
+        rmap = {v:k for k, v in self.object_map.iteritems()}
+        for hyperlink, runs in self.fields.hyperlink_fields:
+            spans = [rmap[r] for r in runs if r in rmap]
+            if not spans:
+                continue
+            # A lone run is turned into the <a> itself, several runs are
+            # wrapped in a new element
+            span = spans[0]
+            if len(spans) > 1:
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            tgt = hyperlink.get('target', None)
+            if tgt:
+                span.set('target', tgt)
+            tt = hyperlink.get('title', None)
+            if tt:
+                span.set('title', tt)
+            url = hyperlink.get('url', None)
+            if url is None:
+                # In-document link of the form: HYPERLINK \l "bookmark"
+                url = hyperlink.get('anchor', None)
+                if url not in self.anchor_map:
+                    continue
+            if url in self.anchor_map:
+                span.set('href', '#' + self.anchor_map[url])
+                continue
+            span.set('href', url)
+
+        for img, link in self.images.links:
+            parent = img.getparent()
+            idx = parent.index(img)
+            a = A(img)
+            a.tail, img.tail = img.tail, None
+            parent.insert(idx, a)
+            tgt = link.get('target', None)
+            if tgt:
+                a.set('target', tgt)
+            tt = link.get('title', None)
+            if tt:
+                a.set('title', tt)
+            rid = link['id']
+            if rid in relationships_by_id:
+                dest = relationships_by_id[rid]
+                if dest.startswith('#'):
+                    if dest[1:] in self.anchor_map:
+                        a.set('href', '#' + self.anchor_map[dest[1:]])
+                else:
+                    a.set('href', dest)
 
     def convert_run(self, run):
         ans = SPAN()
diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py
index 605a58a31f..36fe6b3167 100644
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -339,6 +339,8 @@ class FlowSplitter(object):
                     # We want to keep the descendants of the split point in
                     # Tree 1
                     keep_descendants = True
+                    # We want the split point element, but not its tail
+                    elem.tail = '\n'
 
                 continue
             if hit_split_point:
@@ -357,6 +359,18 @@ class FlowSplitter(object):
         for elem in tuple(body2.iterdescendants()):
             if elem is split_point2:
                 if not before:
+                    # Keep the split point element's tail, if it contains non-whitespace
+                    # text
+                    tail = elem.tail
+                    if tail and not tail.isspace():
+                        parent = elem.getparent()
+                        idx = parent.index(elem)
+                        if idx == 0:
+                            parent.text = (parent.text or '') + tail
+                        else:
+                            sib = parent[idx-1]
+                            sib.tail = (sib.tail or '') + tail
+                    # Remove the element itself
                     nix_element(elem)
                 break
             if elem in ancestors: