DOCX: Images

2025-07-09 03:04:10 -04:00 · 2013-05-20 23:15:22 +05:30 · 2013-05-20 23:15:22 +05:30 · 9aeb3ddf48
commit 9aeb3ddf48
parent 6b6eeba143
4 changed files with 266 additions and 2 deletions
--- a/src/calibre/ebooks/docx/images.py
+++ b/src/calibre/ebooks/docx/images.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+
+from lxml.html.builder import IMG
+
+from calibre.ebooks.docx.names import XPath, get, barename
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.imghdr import what
+
+def emu_to_pt(x):
+    '''
+    Convert a length in English Metric Units (EMU) to points.
+
+    There are 914400 EMU to the inch and 72 points to the inch, which gives
+    12700 EMU per point. x must be a number; no string conversion is done
+    here.
+    '''
+    emu_per_pt = 12700
+    return x / emu_per_pt
+
+def get_image_properties(parent):
+    '''
+    Read the size, visibility and alternate text of a picture container.
+
+    :param parent: element whose ``wp:extent`` and ``wp:docPr`` children are
+                   examined
+    :return: ``(style, alt)`` where style is a dict of CSS properties
+             (``width``/``height`` in pt, ``display: none`` when hidden) and
+             alt is the last non-empty ``descr`` found, or None
+    '''
+    sizes = {'width': None, 'height': None}
+    for extent in XPath('./wp:extent')(parent):
+        for attr, prop in (('cx', 'width'), ('cy', 'height')):
+            try:
+                sizes[prop] = emu_to_pt(int(extent.get(attr)))
+            except (TypeError, ValueError):
+                # Missing or non-numeric attribute, keep any earlier value
+                pass
+
+    ans = {}
+    for prop in ('width', 'height'):
+        if sizes[prop] is not None:
+            ans[prop] = '%.3gpt' % sizes[prop]
+
+    alt = None
+    for docPr in XPath('./wp:docPr')(parent):
+        # An empty or missing descr does not override an earlier one
+        alt = docPr.get('descr', None) or alt
+        if docPr.get('hidden', None) in {'true', 'on', '1'}:
+            ans['display'] = 'none'
+
+    return ans, alt
+
+
+def get_image_margins(elem):
+    '''
+    Convert the ``distL``/``distT``/``distR``/``distB`` attributes of elem
+    into CSS padding declarations.
+
+    :param elem: any object with a dict-like ``get(name, default)`` method
+    :return: dict mapping ``padding-left`` etc. to a length in pt. Attributes
+             that are missing or not integers are left out.
+    '''
+    ans = {}
+    for w, css in (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom')):
+        val = elem.get('dist%s' % w, None)
+        if val is None:
+            continue
+        try:
+            # Attribute values are strings. Without the int() conversion the
+            # division in emu_to_pt() raises TypeError and the margin is
+            # silently dropped.
+            val = emu_to_pt(int(val))
+        except (TypeError, ValueError):
+            continue
+        ans['padding-%s' % css] = '%.3gpt' % val
+    return ans
+
+def get_hpos(anchor, page_width):
+    '''
+    Return the horizontal position of a floating image as a fraction of
+    page_width.
+
+    The first usable value wins, looked for in this order: a named alignment
+    in ``wp:positionH/wp:align`` (left=0, center=0.5, right=1), an offset in
+    ``wp:positionH/wp:posOffset``, the ``x`` attribute of ``wp:simplePos``.
+    If none is found, 0 is returned.
+
+    :param anchor: the ``wp:anchor`` element
+    :param page_width: usable page width in pt, must be non-zero
+    '''
+    # TODO: Handle relativeFrom on positionH
+    for ph in XPath('./wp:positionH')(anchor):
+        for align in XPath('./wp:align')(ph):
+            al = align.text
+            if al == 'left':
+                return 0
+            if al == 'center':
+                return 0.5
+            if al == 'right':
+                return 1
+        for po in XPath('./wp:posOffset')(ph):
+            try:
+                pos = emu_to_pt(int(po.text))
+            except (TypeError, ValueError):
+                continue
+            return pos/page_width
+
+    # NOTE(review): this uses simplePos whenever it is present; confirm
+    # whether the simplePos flag on the anchor itself should be checked first.
+    for sp in XPath('./wp:simplePos')(anchor):
+        try:
+            # The attribute is a string (or None), it has to be converted to
+            # an integer before it can be scaled, otherwise this branch never
+            # produces a value.
+            x = emu_to_pt(int(sp.get('x', None)))
+        except (TypeError, ValueError):
+            continue
+        return x/page_width
+
+    return 0
+
+
+class Images(object):
+
+    '''
+    Converts ``w:drawing`` elements into ``<img>`` tags, writing the image
+    data they refer to into an ``images`` sub-directory of the destination
+    directory.
+
+    Call the instance with the relationship map of the document first, then
+    call :meth:`to_html` for every drawing element.
+    '''
+
+    def __init__(self):
+        self.rid_map = {}  # relationship id -> name passed to docx.read()
+        self.used = {}  # relationship id -> file name already written
+        self.names = set()
+        self.all_images = set()  # 'images/<file name>' of every file written
+        # Both are set by to_html() before any image is extracted
+        self.dest_dir = self.docx = None
+
+    def __call__(self, relationships_by_id):
+        ''' Set the map used to resolve ``r:embed`` relationship ids. '''
+        self.rid_map = relationships_by_id
+
+    def generate_filename(self, rid, base=None):
+        '''
+        Write the image with relationship id rid into self.dest_dir and
+        return the file name used. An image is written only once, later
+        calls with the same rid return the same name.
+
+        :param rid: relationship id, must be a key of self.rid_map
+        :param base: preferred file name, defaults to the last path
+                     component of the relationship target
+        '''
+        if rid in self.used:
+            return self.used[rid]
+        raw = self.docx.read(self.rid_map[rid])
+        base = base or ascii_filename(self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_')
+        # rpartition() puts a name that contains no dot into the *last* slot,
+        # so the stem has to be recovered explicitly. Otherwise a name such
+        # as "Picture_1" would be reduced to just ".<ext>".
+        stem, dot, suffix = base.rpartition('.')
+        if not dot:
+            stem, suffix = base, ''
+        # Prefer the type detected from the data over the one in the name
+        ext = what(None, raw) or suffix or 'jpeg'
+        base = '%s.%s' % (stem, ext)
+        exists = frozenset(self.used.itervalues())
+        c = 0
+        while base in exists:
+            # Name clash with a different image, add a numeric suffix
+            c += 1
+            base = '%s-%d.%s' % (stem, c, ext)
+        self.used[rid] = base
+        with open(os.path.join(self.dest_dir, base), 'wb') as f:
+            f.write(raw)
+        self.all_images.add('images/' + base)
+        return base
+
+    def pic_to_img(self, pic, alt=None):
+        '''
+        Create an ``<img>`` element for the first embedded image of pic that
+        has a known relationship id.
+
+        :param pic: a ``pic:pic`` element
+        :param alt: alternate text to use when the picture does not carry a
+                    description of its own
+        :return: the ``<img>`` element or None if no image was found
+        '''
+        name = None
+        for pr in XPath('descendant::pic:cNvPr')(pic):
+            name = pr.get('name', None)
+            if name:
+                name = ascii_filename(name).replace(' ', '_')
+            # Only override the alt text supplied by the caller when the
+            # picture actually has a description
+            alt = pr.get('descr', None) or alt
+            for a in XPath('descendant::a:blip[@r:embed]')(pic):
+                rid = get(a, 'r:embed')
+                if rid in self.rid_map:
+                    src = self.generate_filename(rid, name)
+                    img = IMG(src='images/%s' % src)
+                    if alt:
+                        # Elements are not callable, attributes are added
+                        # with set()
+                        img.set('alt', alt)
+                    return img
+
+    def _pics_to_html(self, container, style, alt):
+        ''' Yield a styled ``<img>`` for every picture inside container. '''
+        for pic in XPath('descendant::pic:pic')(container):
+            ans = self.pic_to_img(pic, alt)
+            if ans is not None:
+                if style:
+                    ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
+                yield ans
+
+    def drawing_to_html(self, drawing, page):
+        '''
+        Yield an ``<img>`` element for every picture in drawing, the inline
+        pictures first and the floating (anchored) ones after them.
+
+        :param drawing: a ``w:drawing`` element
+        :param page: object with ``width``, ``margin_left`` and
+                     ``margin_right`` attributes, used to place floats
+        '''
+        # First process the inline pictures
+        for inline in XPath('./wp:inline')(drawing):
+            style, alt = get_image_properties(inline)
+            for img in self._pics_to_html(inline, style, alt):
+                yield img
+
+        # Now process the floats
+        for anchor in XPath('./wp:anchor')(drawing):
+            style, alt = get_image_properties(anchor)
+            self.get_float_properties(anchor, style, page)
+            for img in self._pics_to_html(anchor, style, alt):
+                yield img
+
+    def get_float_properties(self, anchor, style, page):
+        '''
+        Add the CSS needed to position a floating image to style (which is
+        modified in place).
+
+        Images that text wraps around become CSS floats, the others are
+        positioned as blocks using automatic margins. The distances from the
+        surrounding text are emitted as padding.
+        '''
+        if 'display' not in style:
+            style['display'] = 'block'
+        padding = get_image_margins(anchor)
+        # Strip the trailing 'pt', an image of unknown width is assumed to be
+        # 100pt wide
+        width = float(style.get('width', '100pt')[:-2])
+
+        page_width = page.width - page.margin_left - page.margin_right
+
+        # Position of the horizontal center of the image as a fraction of the
+        # usable page width
+        hpos = get_hpos(anchor, page_width) + width/(2*page_width)
+
+        wrap_elem = None
+        dofloat = False
+
+        # Use the last wrap element among the children of the anchor
+        for child in reversed(anchor):
+            bt = barename(child.tag)
+            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
+                wrap_elem = child
+                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
+                break
+
+        if wrap_elem is not None:
+            # Distances on the wrap element override those on the anchor
+            padding.update(get_image_margins(wrap_elem))
+            wt = wrap_elem.get('wrapText', None)
+            # Text only on the right means the image is at the left edge and
+            # vice versa
+            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
+            if dofloat:
+                style['float'] = 'left' if hpos < 0.65 else 'right'
+            else:
+                # Left third: no margins, right third: push to the right,
+                # otherwise center
+                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
+                if ml is not None:
+                    style['margin-left'] = ml
+                if mr is not None:
+                    style['margin-right'] = mr
+
+        style.update(padding)
+
+    def to_html(self, elem, page, docx, dest_dir):
+        '''
+        Yield the ``<img>`` elements for elem, creating the ``images``
+        directory inside dest_dir if needed. Only elements whose tag ends
+        with ``}drawing`` produce output.
+
+        :param docx: object whose ``read(name)`` method returns the raw image
+                     data
+        '''
+        dest = os.path.join(dest_dir, 'images')
+        if not os.path.exists(dest):
+            os.mkdir(dest)
+        self.dest_dir, self.docx = dest, docx
+        if elem.tag.endswith('}drawing'):
+            for tag in self.drawing_to_html(elem, page):
+                yield tag
+        # TODO: Handle w:pict
+
+
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -14,6 +14,7 @@ APPPROPS  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
 STYLES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
 NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
 FONTS     = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
+IMAGES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

 namespaces = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@@ -13,6 +13,38 @@ from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
 from calibre.ebooks.docx.char_styles import RunStyle
 from calibre.ebooks.docx.names import XPath, get

+class PageProperties(object):
+
+    '''
+    Class representing page level properties (page size/margins) read from
+    sectPr elements.
+
+    All values are in pts. Anything not specified by the sectPr elements
+    keeps its default: an A4 page with 72pt left and right margins.
+    '''
+
+    def __init__(self, elems=()):
+        # width and height must be unpacked separately, a chained assignment
+        # would store the whole (width, height) tuple in both attributes
+        self.width, self.height = 595.28, 841.89  # pts, A4
+        self.margin_left = self.margin_right = 72  # pts
+        for sectPr in elems:
+            for pgSz in XPath('./w:pgSz')(sectPr):
+                self._set_pts('width', get(pgSz, 'w:w'))
+                self._set_pts('height', get(pgSz, 'w:h'))
+            for pgMar in XPath('./w:pgMar')(sectPr):
+                self._set_pts('margin_left', get(pgMar, 'w:left'))
+                self._set_pts('margin_right', get(pgMar, 'w:right'))
+
+    def _set_pts(self, name, val):
+        '''
+        Set the attribute name from val, which is in twentieths of a pt.
+        The attribute is left unchanged if val is missing or not an integer.
+        '''
+        try:
+            setattr(self, name, int(val)/20)
+        except (ValueError, TypeError):
+            pass
+

 class Style(object):
    '''
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -15,9 +15,10 @@ from lxml.html.builder import (

 from calibre.ebooks.docx.container import DOCX, fromstring
 from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS
-from calibre.ebooks.docx.styles import Styles, inherit
+from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
 from calibre.ebooks.docx.numbering import Numbering
 from calibre.ebooks.docx.fonts import Fonts
+from calibre.ebooks.docx.images import Images
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

 class Text:
@@ -38,6 +39,7 @@ class Convert(object):
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
+        self.images = Images()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
@@ -64,8 +66,12 @@ class Convert(object):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
+        self.images(relationships_by_id)
        self.layers = OrderedDict()
-        for wp in XPath('//w:p')(doc):
+
+        self.read_page_properties(doc)
+        for wp, page_properties in self.page_map.iteritems():
+            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)
        # TODO: tables <w:tbl> child of <w:body> (nested tables?)
@@ -102,6 +108,25 @@ class Convert(object):
                        html_obj.set('class', cls)
        self.write()

+    def read_page_properties(self, doc):
+        '''
+        Build self.page_map, an ordered map of every paragraph in doc to the
+        PageProperties of the section it belongs to.
+
+        A paragraph that contains a sectPr ends a section, its properties
+        apply to it and to all paragraphs since the previous section end. The
+        paragraphs left over at the end use the sectPr of the body.
+        '''
+        self.page_map = OrderedDict()
+        pending = []
+
+        for para in XPath('//w:p')(doc):
+            pending.append(para)
+            sections = XPath('descendant::w:sectPr')(para)
+            if not sections:
+                continue
+            props = PageProperties(sections)
+            for x in pending:
+                self.page_map[x] = props
+            pending = []
+
+        if pending:
+            props = PageProperties(XPath('./w:body/w:sectPr')(doc))
+            for x in pending:
+                self.page_map[x] = props
+
    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
@@ -239,6 +264,10 @@ class Convert(object):
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
+            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
+                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
+                    text.add_elem(img)
+                    ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

@@ -253,3 +282,4 @@ if __name__ == '__main__':
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    Convert(sys.argv[-1], log=default_log)()
+