mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
DOCX: Images
This commit is contained in:
parent
6b6eeba143
commit
9aeb3ddf48
201
src/calibre/ebooks/docx/images.py
Normal file
201
src/calibre/ebooks/docx/images.py
Normal file
@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os
|
||||
|
||||
from lxml.html.builder import IMG
|
||||
|
||||
from calibre.ebooks.docx.names import XPath, get, barename
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.imghdr import what
|
||||
|
||||
def emu_to_pt(x):
    '''
    Convert a length expressed in English Metric Units (EMU) to points.

    :param x: A numeric length in EMU.
    :return: The same length in points (x divided by 12700).
    '''
    emu_per_pt = 12700
    return x / emu_per_pt
|
||||
|
||||
def get_image_properties(parent):
    '''
    Collect the size, visibility and alternate text of an image from the
    wp:extent and wp:docPr children of parent.

    :param parent: The element whose direct wp:extent/wp:docPr children are
        inspected.
    :return: A tuple (style, alt). style is a dict of CSS properties that may
        contain 'width' and 'height' (in pt) and 'display' ('none' when a
        docPr is marked hidden). alt is the last non-empty descr attribute
        found on a docPr, or None.
    '''
    size = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for attr, prop in (('cx', 'width'), ('cy', 'height')):
            try:
                size[prop] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # Missing or non-integer attribute: keep whatever value we
                # already have for this dimension.
                pass

    ans = {}
    for prop in ('width', 'height'):
        if size[prop] is not None:
            ans[prop] = '%.3gpt' % size[prop]

    alt = None
    for docPr in XPath('./wp:docPr')(parent):
        description = docPr.get('descr', None)
        if description:
            alt = description
        hidden = docPr.get('hidden', None)
        if hidden in {'true', 'on', '1'}:
            ans['display'] = 'none'

    return ans, alt
|
||||
|
||||
|
||||
def get_image_margins(elem):
    '''
    Read the distL/distT/distR/distB attributes of elem and convert them to
    CSS padding.

    :param elem: An object with a dict-like get(name, default) method that
        returns attribute values as strings.
    :return: A dict mapping 'padding-left', 'padding-top', 'padding-right'
        and/or 'padding-bottom' to a length in pt. Sides whose attribute is
        missing or not an integer are omitted.
    '''
    ans = {}
    for w, css in (('L', 'left'), ('T', 'top'), ('R', 'right'), ('B', 'bottom')):
        val = elem.get('dist%s' % w, None)
        if val is None:
            continue
        try:
            # Attribute values are strings, they must be converted to a number
            # before doing arithmetic on them, otherwise the division always
            # raises TypeError and the margin is silently dropped.
            val = emu_to_pt(int(val))
        except (TypeError, ValueError):
            continue
        ans['padding-%s' % css] = '%.3gpt' % val
    return ans
|
||||
|
||||
def get_hpos(anchor, page_width):
    '''
    Estimate the horizontal position of an anchored object as a fraction of
    the page width.

    The wp:positionH children of anchor are consulted first: a wp:align of
    left/center/right maps to 0/0.5/1 and a wp:posOffset is converted to pt
    and divided by page_width. If none of them gives a result, the x
    attribute of a wp:simplePos child is used in the same way.

    :param anchor: The element whose wp:positionH/wp:simplePos children are
        inspected.
    :param page_width: The width to divide offsets by, in pt.
    :return: The position as a number (0 when nothing usable is found).
    '''
    # TODO: Handle relativeFrom on positionH
    for ph in XPath('./wp:positionH')(anchor):
        for align in XPath('./wp:align')(ph):
            al = align.text
            if al == 'left':
                return 0
            if al == 'center':
                return 0.5
            if al == 'right':
                return 1
        for po in XPath('./wp:posOffset')(ph):
            try:
                pos = emu_to_pt(int(po.text))
            except (TypeError, ValueError):
                continue
            return pos/page_width

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # The attribute is a string, convert it before doing arithmetic,
            # otherwise TypeError is always raised and this fallback can never
            # produce a position.
            x = emu_to_pt(int(sp.get('x', None)))
        except (TypeError, ValueError):
            continue
        return x/page_width

    return 0
|
||||
|
||||
|
||||
class Images(object):

    '''
    Convert the images referenced from w:drawing elements into <img> tags,
    writing the image data into an images sub-directory of the output
    directory.
    '''

    def __init__(self):
        # relationship id -> name of the image, as passed to docx.read()
        self.rid_map = {}
        # relationship id -> file name generated for that image
        self.used = {}
        self.names = set()
        # Paths (relative to the output directory) of all images written
        self.all_images = set()
        # Set on every call to to_html()
        self.dest_dir = self.docx = None

    def __call__(self, relationships_by_id):
        ''' Set the mapping used to resolve r:embed relationship ids. '''
        self.rid_map = relationships_by_id

    def generate_filename(self, rid, base=None):
        '''
        Write the image with relationship id rid into self.dest_dir and return
        the file name used. The result is cached per rid, so every image is
        written only once.

        :param rid: A key of self.rid_map.
        :param base: Preferred file name. When empty, the last path component
            of the image's name in the container is used instead.
        :return: The file name (without directory) of the written image.
        '''
        if rid in self.used:
            return self.used[rid]
        raw = self.docx.read(self.rid_map[rid])
        base = base or ascii_filename(self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_')
        stem, dot, suffix = base.rpartition('.')
        if not dot:
            # rpartition() puts everything into the last field when there is
            # no separator. Such a name is all stem, it must not be mistaken
            # for an extension (that used to yield names like ".png").
            stem, suffix = base, ''
        ext = what(None, raw) or suffix or 'jpeg'
        stem = stem or 'image'
        exists = frozenset(self.used.values())
        base = '%s.%s' % (stem, ext)
        c = 1
        while base in exists:
            # Always derive the candidate from the original stem so that the
            # counters do not pile up (name-1-2-3.ext)
            base = '%s-%d.%s' % (stem, c, ext)
            c += 1
        self.used[rid] = base
        with open(os.path.join(self.dest_dir, base), 'wb') as f:
            f.write(raw)
        self.all_images.add('images/' + base)
        return base

    def pic_to_img(self, pic, alt=None):
        '''
        Create an <img> tag for the first embedded image in pic that has a
        known relationship id.

        The preferred file name and the alternate text are taken from the
        first pic:cNvPr descendant of pic, if there is one.

        :param alt: Alternate text to use when the cNvPr has no descr.
        :return: The <img> element, or None if no usable image was found.
        '''
        name = None
        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = ascii_filename(name).replace(' ', '_')
            # Do not throw away the alternate text supplied by the caller when
            # the picture itself has no description
            alt = pr.get('descr', None) or alt
            break
        for a in XPath('descendant::a:blip[@r:embed]')(pic):
            rid = get(a, 'r:embed')
            if rid in self.rid_map:
                src = self.generate_filename(rid, name)
                img = IMG(src='images/%s' % src)
                if alt:
                    # Elements are not callable, attributes have to be set
                    # with set()
                    img.set('alt', alt)
                return img

    def _styled_images(self, container, style, alt):
        ''' Yield an <img> with the CSS from style applied for every picture in container. '''
        css = '; '.join('%s: %s' % (k, v) for k, v in style.items())
        for pic in XPath('descendant::pic:pic')(container):
            ans = self.pic_to_img(pic, alt)
            if ans is not None:
                if css:
                    ans.set('style', css)
                yield ans

    def drawing_to_html(self, drawing, page):
        '''
        Yield <img> tags for all the pictures in drawing, inline pictures
        first, followed by the anchored (floating) ones.

        :param page: The page properties used to position anchored pictures.
        '''
        # First process the inline pictures
        for inline in XPath('./wp:inline')(drawing):
            style, alt = get_image_properties(inline)
            for img in self._styled_images(inline, style, alt):
                yield img

        # Now process the floats
        for anchor in XPath('./wp:anchor')(drawing):
            style, alt = get_image_properties(anchor)
            self.get_float_properties(anchor, style, page)
            for img in self._styled_images(anchor, style, alt):
                yield img

    def get_float_properties(self, anchor, style, page):
        '''
        Add the CSS needed to position an anchored picture to style (which is
        modified in place): display, padding and, depending on the wrapping
        mode, either float or automatic left/right margins.

        :param anchor: The wp:anchor element.
        :param style: Dict of CSS properties, as returned by
            get_image_properties().
        :param page: Object with width, margin_left and margin_right
            attributes, in pt.
        '''
        if 'display' not in style:
            style['display'] = 'block'
        padding = get_image_margins(anchor)
        # style['width'] is of the form '<number>pt'
        width = float(style.get('width', '100pt')[:-2])

        page_width = page.width - page.margin_left - page.margin_right

        # Position of the horizontal center of the image, as a fraction of
        # the usable page width
        hpos = get_hpos(anchor, page_width) + width/(2*page_width)

        wrap_elem = None
        dofloat = False

        for child in reversed(anchor):
            bt = barename(child.tag)
            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
                wrap_elem = child
                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
                break

        if wrap_elem is not None:
            padding.update(get_image_margins(wrap_elem))
            wt = wrap_elem.get('wrapText', None)
            # Text wrapping on only one side forces the image to the
            # opposite side
            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
            if dofloat:
                style['float'] = 'left' if hpos < 0.65 else 'right'
            elif hpos > 0.65:
                style['margin-left'] = 'auto'
            elif hpos >= 0.34:
                style['margin-left'] = style['margin-right'] = 'auto'

        style.update(padding)

    def to_html(self, elem, page, docx, dest_dir):
        '''
        Yield the <img> tags for elem, writing the image files into the
        images sub-directory of dest_dir (created if it does not exist).
        Only elements whose tag ends with }drawing produce any output.

        :param docx: The container the image data is read from, must have a
            read(name) method.
        '''
        dest = os.path.join(dest_dir, 'images')
        if not os.path.exists(dest):
            os.mkdir(dest)
        self.dest_dir, self.docx = dest, docx
        if elem.tag.endswith('}drawing'):
            for tag in self.drawing_to_html(elem, page):
                yield tag
        # TODO: Handle w:pict
|
||||
|
||||
|
@ -14,6 +14,7 @@ APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
|
||||
STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
|
||||
NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
|
||||
FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
|
||||
IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
|
||||
|
||||
namespaces = {
|
||||
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
|
||||
|
@ -13,6 +13,38 @@ from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
|
||||
from calibre.ebooks.docx.char_styles import RunStyle
|
||||
from calibre.ebooks.docx.names import XPath, get
|
||||
|
||||
class PageProperties(object):

    '''
    Class representing page level properties (page size/margins) read from
    sectPr elements.

    The attributes width, height, margin_left and margin_right are all
    numbers, in pts. Values that are missing or cannot be parsed are left at
    their defaults (A4 page, 72pt margins).
    '''

    def __init__(self, elems=()):
        # Unpack the tuple, a chained assignment would set both width and
        # height to the tuple (595.28, 841.89) and break all arithmetic done
        # with them.
        self.width, self.height = 595.28, 841.89  # pts, A4
        self.margin_left = self.margin_right = 72  # pts
        for sectPr in elems:
            for pgSz in XPath('./w:pgSz')(sectPr):
                w, h = get(pgSz, 'w:w'), get(pgSz, 'w:h')
                # Attribute values are divided by 20 to get pts
                try:
                    self.width = int(w)/20
                except (ValueError, TypeError):
                    pass
                try:
                    self.height = int(h)/20
                except (ValueError, TypeError):
                    pass
            for pgMar in XPath('./w:pgMar')(sectPr):
                l, r = get(pgMar, 'w:left'), get(pgMar, 'w:right')
                try:
                    self.margin_left = int(l)/20
                except (ValueError, TypeError):
                    pass
                try:
                    self.margin_right = int(r)/20
                except (ValueError, TypeError):
                    pass
|
||||
|
||||
|
||||
class Style(object):
|
||||
'''
|
||||
|
@ -15,9 +15,10 @@ from lxml.html.builder import (
|
||||
|
||||
from calibre.ebooks.docx.container import DOCX, fromstring
|
||||
from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS
|
||||
from calibre.ebooks.docx.styles import Styles, inherit
|
||||
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
|
||||
from calibre.ebooks.docx.numbering import Numbering
|
||||
from calibre.ebooks.docx.fonts import Fonts
|
||||
from calibre.ebooks.docx.images import Images
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
|
||||
class Text:
|
||||
@ -38,6 +39,7 @@ class Convert(object):
|
||||
self.mi = self.docx.metadata
|
||||
self.body = BODY()
|
||||
self.styles = Styles()
|
||||
self.images = Images()
|
||||
self.object_map = OrderedDict()
|
||||
self.html = HTML(
|
||||
HEAD(
|
||||
@ -64,8 +66,12 @@ class Convert(object):
|
||||
doc = self.docx.document
|
||||
relationships_by_id, relationships_by_type = self.docx.document_relationships
|
||||
self.read_styles(relationships_by_type)
|
||||
self.images(relationships_by_id)
|
||||
self.layers = OrderedDict()
|
||||
for wp in XPath('//w:p')(doc):
|
||||
|
||||
self.read_page_properties(doc)
|
||||
for wp, page_properties in self.page_map.iteritems():
|
||||
self.current_page = page_properties
|
||||
p = self.convert_p(wp)
|
||||
self.body.append(p)
|
||||
# TODO: tables <w:tbl> child of <w:body> (nested tables?)
|
||||
@ -102,6 +108,25 @@ class Convert(object):
|
||||
html_obj.set('class', cls)
|
||||
self.write()
|
||||
|
||||
def read_page_properties(self, doc):
    '''
    Populate self.page_map, an ordered mapping of every w:p element in doc
    to the PageProperties object that applies to it.

    A paragraph that contains a w:sectPr supplies the properties for itself
    and for all the paragraphs collected since the previous such paragraph.
    Paragraphs left over at the end use the w:sectPr child of w:body.
    '''
    self.page_map = OrderedDict()
    pending = []

    for para in XPath('//w:p')(doc):
        section = XPath('descendant::w:sectPr')(para)
        pending.append(para)
        if not section:
            continue
        props = PageProperties(section)
        for member in pending:
            self.page_map[member] = props
        pending = []

    if pending:
        props = PageProperties(XPath('./w:body/w:sectPr')(doc))
        for member in pending:
            self.page_map[member] = props
|
||||
|
||||
def read_styles(self, relationships_by_type):
|
||||
|
||||
def get_name(rtype, defname):
|
||||
@ -239,6 +264,10 @@ class Convert(object):
|
||||
br = BR()
|
||||
text.add_elem(br)
|
||||
ans.append(text.elem)
|
||||
elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
|
||||
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
|
||||
text.add_elem(img)
|
||||
ans.append(text.elem)
|
||||
if text.buf:
|
||||
setattr(text.elem, text.attr, ''.join(text.buf))
|
||||
|
||||
@ -253,3 +282,4 @@ if __name__ == '__main__':
|
||||
from calibre.utils.logging import default_log
|
||||
default_log.filter_level = default_log.DEBUG
|
||||
Convert(sys.argv[-1], log=default_log)()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user