DOCX: Images

This commit is contained in:
Kovid Goyal 2013-05-20 23:15:22 +05:30
parent 6b6eeba143
commit 9aeb3ddf48
4 changed files with 266 additions and 2 deletions

View File

@ -0,0 +1,201 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from lxml.html.builder import IMG
from calibre.ebooks.docx.names import XPath, get, barename
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
def emu_to_pt(x):
    ''' Convert the number x from English Metric Units (EMU) to points. '''
    emu_per_pt = 12700
    return x / emu_per_pt
def get_image_properties(parent):
    '''
    Read the size, alternate text and visibility of an image from the
    wp:extent and wp:docPr children of parent.

    Returns a tuple (style, alt). style is a dict of CSS properties: width
    and height in pt (when the cx/cy attributes are valid integers) and
    display: none when docPr is marked hidden. alt is the descr attribute of
    docPr, or None if there is no non-empty description.
    '''
    # cx/cy hold the dimensions in EMU. If there are several extent
    # elements, a later valid value overrides an earlier one.
    dims = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for prop, attr in (('width', 'cx'), ('height', 'cy')):
            try:
                dims[prop] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # Missing or non-numeric attribute, keep what we have
                pass

    ans = {}
    for prop in ('width', 'height'):
        if dims[prop] is not None:
            ans[prop] = '%.3gpt' % dims[prop]

    alt = None
    for docPr in XPath('./wp:docPr')(parent):
        # An empty description never replaces one found earlier
        alt = docPr.get('descr', None) or alt
        if docPr.get('hidden', None) in ('true', 'on', '1'):
            ans['display'] = 'none'

    return ans, alt
def get_image_margins(elem):
    '''
    Convert the distL/distT/distR/distB attributes of elem (distances from
    the surrounding text, in EMU) into CSS padding properties.

    Returns a dict mapping padding-left/top/right/bottom to a length in pt.
    Attributes that are missing or are not valid integers are skipped.
    '''
    ans = {}
    for w, css in (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom')):
        val = elem.get('dist%s' % w, None)
        if val is None:
            continue
        try:
            # Attribute values are strings, they have to be converted to a
            # number before emu_to_pt can do arithmetic on them. Passing the
            # raw string raised a TypeError that was swallowed below, so no
            # padding was ever generated.
            val = emu_to_pt(int(val))
        except (TypeError, ValueError):
            continue
        ans['padding-%s' % css] = '%.3gpt' % val
    return ans
def get_hpos(anchor, page_width):
    '''
    Return the horizontal position of the floating object described by
    anchor, as a fraction of page_width.

    The position is taken from, in order of preference:

        1. wp:positionH/wp:align: left, center and right map to 0, 0.5 and 1
        2. wp:positionH/wp:posOffset: an offset in EMU
        3. wp:simplePos: the x attribute, in EMU

    Offsets are converted to pt before being divided by page_width, so
    page_width is expected to be in pt as well. If no usable position is
    found, 0 is returned.
    '''
    # TODO: Handle relativeFrom on positionH
    named_positions = {'left': 0, 'center': 0.5, 'right': 1}
    for ph in XPath('./wp:positionH')(anchor):
        for align in XPath('./wp:align')(ph):
            # text is None for an empty element
            al = (align.text or '').strip()
            if al in named_positions:
                return named_positions[al]
        for po in XPath('./wp:posOffset')(ph):
            try:
                pos = emu_to_pt(int(po.text))
            except (TypeError, ValueError):
                continue
            return pos/page_width

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # The attribute is a string (or None), it must be converted to a
            # number first, otherwise emu_to_pt always raises TypeError and
            # simplePos is silently ignored.
            x = emu_to_pt(int(sp.get('x', None)))
        except (TypeError, ValueError):
            continue
        return x/page_width

    return 0
class Images(object):

    '''
    Extract the images referenced by w:drawing elements of a DOCX document
    into an images sub-directory and generate the <img> tags for them.

    The object is first called with the map of relationship ids for the
    document, after which to_html() can be used on individual elements.
    '''

    def __init__(self):
        self.rid_map = {}  # relationship id -> name of the image inside the docx
        self.used = {}  # relationship id -> file name the image was written to
        self.names = set()
        self.all_images = set()  # images/<file name> of every image written

    def __call__(self, relationships_by_id):
        ''' Set the map used to resolve the r:embed ids of images. '''
        self.rid_map = relationships_by_id

    def generate_filename(self, rid, base=None):
        '''
        Write the image with relationship id rid into self.dest_dir and
        return the file name used. Each image is written only once, later
        calls for the same rid return the name generated the first time.

        base, if specified, is the preferred file name, otherwise the name
        the image has inside the docx file is used. The extension is taken
        from the image data when the type can be detected, then from base,
        falling back to jpeg.
        '''
        if rid in self.used:
            return self.used[rid]
        raw = self.docx.read(self.rid_map[rid])
        base = base or ascii_filename(self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_')
        stem, dot, suffix = base.rpartition('.')
        if not dot:
            # base has no extension (typical for names coming from
            # pic:cNvPr), so all of it is the stem. Previously the stem was
            # thrown away, resulting in files named just .png
            stem, suffix = base, ''
        ext = what(None, raw) or suffix or 'jpeg'
        stem = stem or 'image'
        name = '%s.%s' % (stem, ext)
        exists = frozenset(self.used.values())
        c = 1
        while name in exists:
            # Always derive the candidate from the original stem so that
            # counters do not pile up (name-1-2.png)
            name = '%s-%d.%s' % (stem, c, ext)
            c += 1
        self.used[rid] = name
        with open(os.path.join(self.dest_dir, name), 'wb') as f:
            f.write(raw)
        self.all_images.add('images/' + name)
        return name

    def pic_to_img(self, pic, alt=None):
        '''
        Convert a pic:pic element into an <img> tag, writing out the image it
        embeds. alt is the alternate text to use when the picture itself has
        no description. Returns None if the picture does not embed an image
        present in the relationships map.
        '''
        name = None
        # NOTE(review): a pic:pic is expected to have a single cNvPr, with
        # several of them the values from the last one are used -- confirm
        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = ascii_filename(name).replace(' ', '_')
            # Do not discard the alt text supplied by the caller when the
            # picture has no description of its own
            alt = pr.get('descr', None) or alt
        for a in XPath('descendant::a:blip[@r:embed]')(pic):
            rid = get(a, 'r:embed')
            if rid in self.rid_map:
                src = self.generate_filename(rid, name)
                img = IMG(src='images/%s' % src)
                if alt:
                    # Elements are not callable, attributes have to be set
                    # with set()
                    img.set('alt', alt)
                return img

    def _styled_images(self, container, style, alt):
        ''' Yield an <img> with style applied for every picture in container. '''
        css = '; '.join('%s: %s' % (k, v) for k, v in style.items())
        for pic in XPath('descendant::pic:pic')(container):
            img = self.pic_to_img(pic, alt)
            if img is not None:
                if css:
                    img.set('style', css)
                yield img

    def drawing_to_html(self, drawing, page):
        '''
        Yield the <img> tags for all pictures in the w:drawing element
        drawing. page supplies the page width and margins needed to position
        floating pictures.
        '''
        # First process the inline pictures
        for inline in XPath('./wp:inline')(drawing):
            style, alt = get_image_properties(inline)
            for img in self._styled_images(inline, style, alt):
                yield img

        # Now process the floats
        for anchor in XPath('./wp:anchor')(drawing):
            style, alt = get_image_properties(anchor)
            self.get_float_properties(anchor, style, page)
            for img in self._styled_images(anchor, style, alt):
                yield img

    def get_float_properties(self, anchor, style, page):
        '''
        Add to style (which is modified in place) the CSS needed to
        approximate the placement of the floating object described by
        anchor: display, float or automatic margins, and padding.
        '''
        if 'display' not in style:
            style['display'] = 'block'
        padding = get_image_margins(anchor)
        width = float(style.get('width', '100pt')[:-2])

        # Position of the center of the image as a fraction of the width of
        # the text area
        page_width = page.width - page.margin_left - page.margin_right
        hpos = get_hpos(anchor, page_width) + width/(2*page_width)

        wrap_elem = None
        dofloat = False
        for child in reversed(anchor):
            bt = barename(child.tag)
            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
                wrap_elem = child
                # Text only flows beside the image for these wrap types
                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
                break

        if wrap_elem is not None:
            padding.update(get_image_margins(wrap_elem))
            wt = wrap_elem.get('wrapText', None)
            # Text on the right means the image is on the left and vice versa
            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
            if dofloat:
                style['float'] = 'left' if hpos < 0.65 else 'right'
            else:
                # Not floated: leave on the left, push right or center using
                # automatic margins
                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
                if ml is not None:
                    style['margin-left'] = ml
                if mr is not None:
                    style['margin-right'] = mr

        style.update(padding)

    def to_html(self, elem, page, docx, dest_dir):
        '''
        Yield the <img> tags for elem, which should be a w:drawing element
        (nothing is generated for other elements). Image files are read from
        docx and written to the images sub-directory of dest_dir, which is
        created if needed.
        '''
        dest = os.path.join(dest_dir, 'images')
        if not os.path.exists(dest):
            os.mkdir(dest)
        self.dest_dir, self.docx = dest, docx
        if elem.tag.endswith('}drawing'):
            for tag in self.drawing_to_html(elem, page):
                yield tag
        # TODO: Handle w:pict

View File

@ -14,6 +14,7 @@ APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles' STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering' NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable' FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
namespaces = { namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -13,6 +13,38 @@ from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.char_styles import RunStyle from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get from calibre.ebooks.docx.names import XPath, get
class PageProperties(object):

    '''
    Class representing page level properties (page size/margins) read from
    sectPr elements.

    All values (width, height, margin_left, margin_right) are in pts. When
    elems is empty, or a value is missing or invalid, the defaults of an A4
    page with 72pt side margins are used.
    '''

    def __init__(self, elems=()):
        # The two values must be unpacked, a chained assignment would set
        # both width and height to the tuple
        self.width, self.height = 595.28, 841.89  # pts, A4
        self.margin_left = self.margin_right = 72  # pts
        for sectPr in elems:
            for pgSz in XPath('./w:pgSz')(sectPr):
                self.width = self._to_pts(get(pgSz, 'w:w'), self.width)
                self.height = self._to_pts(get(pgSz, 'w:h'), self.height)
            for pgMar in XPath('./w:pgMar')(sectPr):
                self.margin_left = self._to_pts(get(pgMar, 'w:left'), self.margin_left)
                self.margin_right = self._to_pts(get(pgMar, 'w:right'), self.margin_right)

    @staticmethod
    def _to_pts(raw, default):
        ''' Convert raw (an integer in twentieths of a pt) to pts, or return default if it is invalid. '''
        try:
            return int(raw)/20
        except (ValueError, TypeError):
            return default
class Style(object): class Style(object):
''' '''

View File

@ -15,9 +15,10 @@ from lxml.html.builder import (
from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS
from calibre.ebooks.docx.styles import Styles, inherit from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text: class Text:
@ -38,6 +39,7 @@ class Convert(object):
self.mi = self.docx.metadata self.mi = self.docx.metadata
self.body = BODY() self.body = BODY()
self.styles = Styles() self.styles = Styles()
self.images = Images()
self.object_map = OrderedDict() self.object_map = OrderedDict()
self.html = HTML( self.html = HTML(
HEAD( HEAD(
@ -64,8 +66,12 @@ class Convert(object):
doc = self.docx.document doc = self.docx.document
relationships_by_id, relationships_by_type = self.docx.document_relationships relationships_by_id, relationships_by_type = self.docx.document_relationships
self.read_styles(relationships_by_type) self.read_styles(relationships_by_type)
self.images(relationships_by_id)
self.layers = OrderedDict() self.layers = OrderedDict()
for wp in XPath('//w:p')(doc):
self.read_page_properties(doc)
for wp, page_properties in self.page_map.iteritems():
self.current_page = page_properties
p = self.convert_p(wp) p = self.convert_p(wp)
self.body.append(p) self.body.append(p)
# TODO: tables <w:tbl> child of <w:body> (nested tables?) # TODO: tables <w:tbl> child of <w:body> (nested tables?)
@ -102,6 +108,25 @@ class Convert(object):
html_obj.set('class', cls) html_obj.set('class', cls)
self.write() self.write()
def read_page_properties(self, doc):
    '''
    Build self.page_map, an ordered map of every w:p element in doc to the
    PageProperties object that applies to it.

    A paragraph containing a w:sectPr closes a group: it and all paragraphs
    since the previous such paragraph share one PageProperties built from
    that sectPr. Paragraphs left over at the end use the w:sectPr that is a
    direct child of w:body.
    '''
    pending = []
    page_map = self.page_map = OrderedDict()
    for para in XPath('//w:p')(doc):
        pending.append(para)
        section = XPath('descendant::w:sectPr')(para)
        if not section:
            continue
        # This paragraph ends a section, flush everything collected so far
        props = PageProperties(section)
        for x in pending:
            page_map[x] = props
        pending = []

    if pending:
        props = PageProperties(XPath('./w:body/w:sectPr')(doc))
        for x in pending:
            page_map[x] = props
def read_styles(self, relationships_by_type): def read_styles(self, relationships_by_type):
def get_name(rtype, defname): def get_name(rtype, defname):
@ -239,6 +264,10 @@ class Convert(object):
br = BR() br = BR()
text.add_elem(br) text.add_elem(br)
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
text.add_elem(img)
ans.append(text.elem)
if text.buf: if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf)) setattr(text.elem, text.attr, ''.join(text.buf))
@ -253,3 +282,4 @@ if __name__ == '__main__':
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
default_log.filter_level = default_log.DEBUG default_log.filter_level = default_log.DEBUG
Convert(sys.argv[-1], log=default_log)() Convert(sys.argv[-1], log=default_log)()