Merge branch 'docx'

Fixes #1196728 [Private bug](https://bugs.launchpad.net/calibre/+bug/1196728)
2025-07-09 03:04:10 -04:00 · 2013-07-03 13:37:25 +05:30 · 2013-07-03 13:37:25 +05:30 · 541db88ebf
commit 541db88ebf
parent e8839bc8dc c8c3741d34
6 changed files with 215 additions and 5 deletions
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@ -183,7 +183,7 @@ class DOCX(object):
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
-                if item.get('TargetMode', None) != 'External':
+                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
--- a/src/calibre/ebooks/docx/fields.py
+++ b/src/calibre/ebooks/docx/fields.py
@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+
+from calibre.ebooks.docx.names import XPath, get
+
+class Field(object):
+
+    '''
+    A single complex field: the object created for a ``w:fldChar`` element of
+    type ``begin``, together with whatever is attached to it until the
+    matching ``end`` element is seen.
+    '''
+
+    def __init__(self, start):
+        # The element that opened this field
+        self.start = start
+        # The element that closed this field, None until one is assigned
+        self.end = None
+        # Elements appended by the owner of this object while the field is open
+        self.contents = []
+        # List of (name, arguments) tuples, see add_instr()
+        self.instructions = []
+
+    def add_instr(self, elem):
+        '''
+        Record the instruction stored in the text of ``elem``.
+
+        The stripped text is split at the first space into the instruction
+        name and its (stripped) arguments. Elements whose text is missing,
+        empty or made up only of whitespace are ignored, so that they do not
+        show up as spurious empty instructions.
+        '''
+        raw = (elem.text or '').strip()
+        if not raw:
+            return
+        name, rest = raw.partition(' ')[0::2]
+        self.instructions.append((name, rest.strip()))
+
+# Token types produced by the scanner below
+WORD, FLAG = 0, 1
+# Maps the letter of a flag (the x in \x) to the key used for it in the dict
+# returned by parse_hyperlink()
+HYPERLINK_FLAGS = {'l':'anchor', 'm':'image-map', 'n':'target', 'o':'title', 't':'target'}
+scanner = re.Scanner([
+    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
+    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
+    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
+    (r'\s+', None),
+], flags=re.DOTALL)
+
+
+def parse_hyperlink(raw, log):
+    '''
+    Parse the arguments of a HYPERLINK field instruction.
+
+    :param raw: The instruction text that follows the word HYPERLINK
+    :param log: Callable that is passed a message if raw is invalid
+    :return: A dict. Empty if raw has no tokens or if its first token is not a
+        word (a message is logged in the latter case). Otherwise the first
+        token is stored under 'url' and every recognized flag is stored under
+        its option name (see HYPERLINK_FLAGS). The value of an option is the
+        first word that follows its flag, provided no other flag comes in
+        between, otherwise it is None.
+    '''
+    ans = {}
+    last_option = None
+    for token, token_type in scanner.scan(raw)[0]:
+        if not ans:
+            if token_type != WORD:
+                log('Invalid hyperlink, first token is not a URL (%s)' % raw)
+                return ans
+            ans['url'] = token
+        elif token_type == FLAG:
+            # Unknown flags reset last_option, so their argument is skipped
+            last_option = HYPERLINK_FLAGS.get(token[1], None)
+            if last_option is not None:
+                ans[last_option] = None
+        elif last_option is not None:
+            # A flag takes at most one argument, forget the flag once the
+            # argument is consumed so that stray words cannot overwrite it
+            ans[last_option] = token
+            last_option = None
+    return ans
+
+
+class Fields(object):
+
+    '''
+    Collects the complex fields found in a document and extracts the
+    information needed to convert HYPERLINK fields into links.
+    '''
+
+    def __init__(self):
+        # Every Field created so far, in the order their begin markers were seen
+        self.fields = []
+        # List of (hyperlink dict, list of w:r elements) tuples, rebuilt by
+        # __call__(). Defined here so that it can be read safely even if
+        # __call__() has not been run.
+        self.hyperlink_fields = []
+
+    def __call__(self, doc, log):
+        '''
+        Scan ``doc`` for fields, adding them to self.fields, and then rebuild
+        self.hyperlink_fields from all the fields in self.fields.
+
+        :param doc: The parsed document, it is queried via XPath
+        :param log: Callable used to report invalid HYPERLINK instructions
+        '''
+        stack = []
+        for elem in XPath(
+            '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]')(doc):
+            if elem.tag.endswith('}fldChar'):
+                if get(elem, 'w:fldCharType') == 'begin':
+                    stack.append(Field(elem))
+                    self.fields.append(stack[-1])
+                elif stack:
+                    # End markers that have no matching begin are ignored
+                    stack.pop().end = elem
+            elif stack:
+                # Everything else is attributed to the innermost open field
+                if elem.tag.endswith('}instrText'):
+                    stack[-1].add_instr(elem)
+                else:
+                    stack[-1].contents.append(elem)
+
+        # Parse hyperlink fields
+        self.hyperlink_fields = []
+        for field in self.fields:
+            if len(field.instructions) != 1 or field.instructions[0][0] != 'HYPERLINK':
+                continue
+            hl = parse_hyperlink(field.instructions[0][1], log)
+            if not hl:
+                continue
+            if 'target' in hl and hl['target'] is None:
+                # A target flag without an argument
+                hl['target'] = '_blank'
+            # We only handle spans in a single paragraph
+            # being wrapped in <a>
+            for runs in self._group_runs(field.contents):
+                self.hyperlink_fields.append((hl, runs))
+
+    def _group_runs(self, contents):
+        '''
+        Return the w:r elements in contents as a list of lists, a new list is
+        started at every w:p element. Empty lists are left out.
+        '''
+        all_runs = []
+        current_runs = []
+        for x in contents:
+            if x.tag.endswith('}p'):
+                if current_runs:
+                    all_runs.append(current_runs)
+                current_runs = []
+            elif x.tag.endswith('}r'):
+                current_runs.append(x)
+        if current_runs:
+            all_runs.append(current_runs)
+        return all_runs
+
+
--- a/src/calibre/ebooks/docx/images.py
+++ b/src/calibre/ebooks/docx/images.py
@ -96,6 +96,7 @@ class Images(object):
        self.used = {}
        self.names = set()
        self.all_images = set()
+        self.links = []

    def __call__(self, relationships_by_id):
        self.rid_map = relationships_by_id
@ -125,8 +126,18 @@ class Images(object):
        self.all_images.add('images/' + name)
        return name

-    def pic_to_img(self, pic, alt=None):
+    def pic_to_img(self, pic, alt, parent):
        name = None
+        link = None
+        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
+            link = {'id':get(hl, 'r:id')}
+            tgt = hl.get('tgtFrame', None)
+            if tgt:
+                link['target'] = tgt
+            title = hl.get('tooltip', None)
+            if title:
+                link['title'] = title
+
        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
@ -138,6 +149,8 @@ class Images(object):
                    src = self.generate_filename(rid, name)
                    img = IMG(src='images/%s' % src)
                    img.set('alt', alt or 'Image')
+                    if link is not None:
+                        self.links.append((img, link))
                    return img

    def drawing_to_html(self, drawing, page):
@ -145,7 +158,7 @@ class Images(object):
        for inline in XPath('./wp:inline')(drawing):
            style, alt = get_image_properties(inline)
            for pic in XPath('descendant::pic:pic')(inline):
-                ans = self.pic_to_img(pic, alt)
+                ans = self.pic_to_img(pic, alt, inline)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
@ -156,7 +169,7 @@ class Images(object):
            style, alt = get_image_properties(anchor)
            self.get_float_properties(anchor, style, page)
            for pic in XPath('descendant::pic:pic')(anchor):
-                ans = self.pic_to_img(pic, alt)
+                ans = self.pic_to_img(pic, alt, anchor)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@ -403,6 +403,11 @@ class Styles(object):
                        ps.margin_top = 0
            last_para = p

+    def apply_section_page_breaks(self, paras):
+        ''' Set pageBreakBefore to True on the style object that
+        self.resolve_paragraph() returns for each paragraph in paras. '''
+        for para in paras:
+            self.resolve_paragraph(para).pageBreakBefore = True
+
    def register(self, css, prefix):
        h = hash(frozenset(css.iteritems()))
        ans, _ = self.classes.get(h, (None, None))
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@ -26,6 +26,7 @@ from calibre.ebooks.docx.footnotes import Footnotes
 from calibre.ebooks.docx.cleanup import cleanup_markup
 from calibre.ebooks.docx.theme import Theme
 from calibre.ebooks.docx.toc import create_toc
+from calibre.ebooks.docx.fields import Fields
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

@ -52,6 +53,7 @@ class Convert(object):
        self.body = BODY()
        self.theme = Theme()
        self.tables = Tables()
+        self.fields = Fields()
        self.styles = Styles(self.tables)
        self.images = Images()
        self.object_map = OrderedDict()
@ -79,6 +81,7 @@ class Convert(object):
    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
+        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
@ -96,7 +99,11 @@ class Convert(object):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
+        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
+        # Apply page breaks at the start of every section, except the first
+        # section (since that will be the start of the file)
+        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        if self.footnotes.has_notes:
@ -177,6 +184,7 @@ class Convert(object):
    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
+        self.section_starts = []

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
@ -186,8 +194,10 @@ class Convert(object):
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
-                for x in current + [p]:
+                paras = current + [p]
+                for x in paras:
                    self.page_map[x] = pr
+                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)
@ -287,6 +297,22 @@ class Convert(object):
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

+    def read_block_anchors(self, doc):
+        '''
+        Handle named bookmarks that are direct children of the document body,
+        i.e. that are not inside any paragraph. Each such bookmark is mapped,
+        in self.anchor_map, to the id of the next paragraph that has an entry
+        in self.object_map. The paragraph is given an id if it lacks one.
+
+        NOTE(review): this relies on descendants() yielding elements in
+        document order, which is not visible from here.
+
+        :param doc: The parsed document to search for bookmarks
+        '''
+        doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+        if not doc_anchors:
+            return
+        # Names of the block level bookmarks seen since the last paragraph
+        # that was used as a destination. There can be more than one, in
+        # which case all of them have to point to the same paragraph.
+        pending = []
+        rmap = {v:k for k, v in self.object_map.iteritems()}
+        for elem in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+            if elem.tag.endswith('}p'):
+                if pending and elem in rmap:
+                    para = rmap[elem]
+                    if 'id' not in para.attrib:
+                        para.set('id', generate_anchor(pending[-1], frozenset(self.anchor_map.itervalues())))
+                    for name in pending:
+                        self.anchor_map[name] = para.get('id')
+                    pending = []
+            elif elem in doc_anchors:
+                name = get(elem, 'w:name')
+                if name:
+                    pending.append(name)
+
    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
@ -316,7 +342,13 @@ class Convert(object):
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
+                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
+                    if old_anchor is not None:
+                        # The previous anchor was not applied to any element
+                        for a, t in tuple(self.anchor_map.iteritems()):
+                            if t == old_anchor:
+                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

@ -396,6 +428,46 @@ class Convert(object):
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
+        rmap = {v:k for k, v in self.object_map.iteritems()}
+        for hyperlink, runs in self.fields.hyperlink_fields:
+            spans = [rmap[r] for r in runs if r in rmap]
+            if not spans:
+                continue
+            if len(spans) > 1:
+                span = self.wrap_elems(spans, SPAN())
+            span.tag = 'a'
+            tgt = hyperlink.get('target', None)
+            if tgt:
+                span.set('target', tgt)
+            tt = hyperlink.get('title', None)
+            if tt:
+                span.set('title', tt)
+            url = hyperlink['url']
+            if url in self.anchor_map:
+                span.set('href', '#' + self.anchor_map[url])
+                continue
+            span.set('href', url)
+
+        for img, link in self.images.links:
+            parent = img.getparent()
+            idx = parent.index(img)
+            a = A(img)
+            a.tail, img.tail = img.tail, None
+            parent.insert(idx, a)
+            tgt = link.get('target', None)
+            if tgt:
+                a.set('target', tgt)
+            tt = link.get('title', None)
+            if tt:
+                a.set('title', tt)
+            rid = link['id']
+            if rid in relationships_by_id:
+                dest = relationships_by_id[rid]
+                if dest.startswith('#'):
+                    if dest[1:] in self.anchor_map:
+                        a.set('href', '#' + self.anchor_map[dest[1:]])
+                else:
+                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -339,6 +339,8 @@ class FlowSplitter(object):
                    # We want to keep the descendants of the split point in
                    # Tree 1
                    keep_descendants = True
+                    # We want the split point element, but not its tail
+                    elem.tail = '\n'

                continue
            if hit_split_point:
@ -357,6 +359,18 @@ class FlowSplitter(object):
        for elem in tuple(body2.iterdescendants()):
            if elem is split_point2:
                if not before:
+                    # Keep the split point element's tail, if it contains non-whitespace
+                    # text
+                    tail = elem.tail
+                    if tail and not tail.isspace():
+                        parent = elem.getparent()
+                        idx = parent.index(elem)
+                        if idx == 0:
+                            parent.text = (parent.text or '') + tail
+                        else:
+                            sib = parent[idx-1]
+                            sib.tail = (sib.tail or '') + tail
+                    # Remove the element itself
                    nix_element(elem)
                break
            if elem in ancestors: