diff --git a/src/calibre/ebooks/docx/images.py b/src/calibre/ebooks/docx/images.py
new file mode 100644
index 0000000000..583849728c
--- /dev/null
+++ b/src/calibre/ebooks/docx/images.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal '
+
+import os
+
+from lxml.html.builder import IMG
+
+from calibre.ebooks.docx.names import XPath, get, barename
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.imghdr import what
+
+def emu_to_pt(x):
+    ''' Convert a length in EMUs to points (12700 EMUs per point) '''
+    return x / 12700
+
+def get_image_properties(parent):
+    '''
+    Read the image size from the wp:extent children of parent and the alt
+    text/visibility from its wp:docPr children.
+
+    Returns (style, alt): style is a dict of CSS properties (width and height
+    in pt, display: none for hidden images) and alt is the description or None.
+    '''
+    width = height = None
+    for extent in XPath('./wp:extent')(parent):
+        try:
+            width = emu_to_pt(int(extent.get('cx')))
+        except (TypeError, ValueError):
+            pass
+        try:
+            height = emu_to_pt(int(extent.get('cy')))
+        except (TypeError, ValueError):
+            pass
+    ans = {}
+    if width is not None:
+        ans['width'] = '%.3gpt' % width
+    if height is not None:
+        ans['height'] = '%.3gpt' % height
+
+    alt = None
+    for docPr in XPath('./wp:docPr')(parent):
+        x = docPr.get('descr', None)
+        if x:
+            alt = x
+        if docPr.get('hidden', None) in {'true', 'on', '1'}:
+            ans['display'] = 'none'
+
+    return ans, alt
+
+
+def get_image_margins(elem):
+    '''
+    Convert the distL/distT/distR/distB attributes of elem (treated as EMUs)
+    into CSS padding properties in pt. Missing or malformed attributes are
+    skipped.
+    '''
+    ans = {}
+    for w, css in {'L':'left', 'T':'top', 'R':'right', 'B':'bottom'}.iteritems():
+        val = elem.get('dist%s' % w, None)
+        if val is not None:
+            try:
+                # Attribute values are strings, parse them before scaling
+                val = emu_to_pt(int(val))
+            except (TypeError, ValueError):
+                continue
+            ans['padding-%s' % css] = '%.3gpt' % val
+    return ans
+
+def get_hpos(anchor, page_width):
+    '''
+    Return the horizontal position of an anchored image as a fraction of
+    page_width: 0, 0.5 or 1 for left/center/right alignment, otherwise the
+    first valid offset divided by page_width. Returns 0 if nothing is found.
+    '''
+    # TODO: Handle relativeFrom on positionH
+    for ph in XPath('./wp:positionH')(anchor):
+        for align in XPath('./wp:align')(ph):
+            al = align.text
+            if al == 'left':
+                return 0
+            if al == 'center':
+                return 0.5
+            if al == 'right':
+                return 1
+        for po in XPath('./wp:posOffset')(ph):
+            try:
+                pos = emu_to_pt(int(po.text))
+            except (TypeError, ValueError):
+                continue
+            return pos/page_width
+
+    for sp in XPath('./wp:simplePos')(anchor):
+        try:
+            # x is an attribute (a string), so it has to be parsed like po.text
+            x = emu_to_pt(int(sp.get('x', None)))
+        except (TypeError, ValueError):
+            continue
+        return x/page_width
+
+    return 0
+
+
+class Images(object):
+
+    '''
+    Extracts the images referenced by the drawings in a DOCX document into an
+    images directory and generates the corresponding img tags.
+    '''
+
+    def __init__(self):
+        self.rid_map = {}
+        self.used = {}
+        self.names = set()
+        self.all_images = set()
+
+    def __call__(self, relationships_by_id):
+        self.rid_map = relationships_by_id
+
+    def generate_filename(self, rid, base=None):
+        '''
+        Write the image with relationship id rid into self.dest_dir and return
+        its file name, which is unique among the images written so far. The
+        result is cached, so every rid is written only once.
+        '''
+        if rid in self.used:
+            return self.used[rid]
+        raw = self.docx.read(self.rid_map[rid])
+        base = base or ascii_filename(self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_')
+        stem, dot, oext = base.rpartition('.')
+        if not dot:
+            # No extension (e.g. "Picture_1"): the whole name is the stem
+            stem, oext = base, ''
+        ext = what(None, raw) or oext or 'jpeg'
+        base = stem + '.' + ext
+        exists = frozenset(self.used.itervalues())
+        c = 1
+        while base in exists:
+            # Always number from the original stem, so that suffixes do not
+            # accumulate (name-1-2.png)
+            base = '%s-%d.%s' % (stem, c, ext)
+            c += 1
+        self.used[rid] = base
+        with open(os.path.join(self.dest_dir, base), 'wb') as f:
+            f.write(raw)
+        self.all_images.add('images/' + base)
+        return base
+
+    def pic_to_img(self, pic, alt=None):
+        '''
+        Convert a pic:pic element into an img tag, writing out the image it
+        references. Returns None if it references no known relationship id.
+        '''
+        name = None
+        for pr in XPath('descendant::pic:cNvPr')(pic):
+            name = pr.get('name', None)
+            if name:
+                name = ascii_filename(name).replace(' ', '_')
+            # Fall back to the alt text supplied by the caller
+            alt = pr.get('descr', None) or alt
+        for a in XPath('descendant::a:blip[@r:embed]')(pic):
+            rid = get(a, 'r:embed')
+            if rid in self.rid_map:
+                src = self.generate_filename(rid, name)
+                img = IMG(src='images/%s' % src)
+                if alt:
+                    img.set('alt', alt)
+                return img
+
+    def drawing_to_html(self, drawing, page):
+        ''' Yield an img tag for every picture in a w:drawing element '''
+        # First process the inline pictures
+        for inline in XPath('./wp:inline')(drawing):
+            style, alt = get_image_properties(inline)
+            for pic in XPath('descendant::pic:pic')(inline):
+                ans = self.pic_to_img(pic, alt)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
+                    yield ans
+
+        # Now process the floats
+        for anchor in XPath('./wp:anchor')(drawing):
+            style, alt = get_image_properties(anchor)
+            self.get_float_properties(anchor, style, page)
+            for pic in XPath('descendant::pic:pic')(anchor):
+                ans = self.pic_to_img(pic, alt)
+                if ans is not None:
+                    if style:
+                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in style.iteritems()))
+                    yield ans
+
+    def get_float_properties(self, anchor, style, page):
+        '''
+        Add the CSS needed to position an anchored (floating) image to style,
+        in place: display, padding and either float or auto margins, based on
+        the horizontal position of the image and its text wrapping mode.
+        '''
+        if 'display' not in style:
+            style['display'] = 'block'
+        padding = get_image_margins(anchor)
+        width = float(style.get('width', '100pt')[:-2])
+
+        page_width = page.width - page.margin_left - page.margin_right
+
+        hpos = get_hpos(anchor, page_width) + width/(2*page_width)
+
+        wrap_elem = None
+        dofloat = False
+
+        for child in reversed(anchor):
+            bt = barename(child.tag)
+            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
+                wrap_elem = child
+                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
+                break
+
+        if wrap_elem is not None:
+            padding.update(get_image_margins(wrap_elem))
+            wt = wrap_elem.get('wrapText', None)
+            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
+            if dofloat:
+                style['float'] = 'left' if hpos < 0.65 else 'right'
+            else:
+                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
+                if ml is not None:
+                    style['margin-left'] = ml
+                if mr is not None:
+                    style['margin-right'] = mr
+
+        style.update(padding)
+
+    def to_html(self, elem, page, docx, dest_dir):
+        '''
+        Yield the img tags for elem, creating the images directory inside
+        dest_dir if needed. Only w:drawing elements are handled currently.
+        '''
+        dest = os.path.join(dest_dir, 'images')
+        if not os.path.exists(dest):
+            os.mkdir(dest)
+        self.dest_dir, self.docx = dest, docx
+        if elem.tag.endswith('}drawing'):
+            for tag in self.drawing_to_html(elem, page):
+                yield tag
+        # TODO: Handle w:pict
+
+
diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index da643dcc2c..d6cecdeeb6 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -14,6 +14,7 @@ APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
 STYLES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles'
 NUMBERING = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering'
 FONTS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable'
+IMAGES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
 
 namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py
index c17418d0dd..2816156a67 100644
--- a/src/calibre/ebooks/docx/styles.py
+++ b/src/calibre/ebooks/docx/styles.py
@@ -13,6 +13,38 @@ from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
 from calibre.ebooks.docx.char_styles import RunStyle
 from calibre.ebooks.docx.names import XPath, get
 
+class PageProperties(object):
+
+    '''
+    Class representing page level properties (page size/margins) read from
+    sectPr elements.
+    '''
+
+    def __init__(self, elems=()):
+        self.width, self.height = 595.28, 841.89 # pts, A4 (two scalars, not one shared tuple)
+        self.margin_left = self.margin_right = 72 # pts
+        for sectPr in elems:
+            for pgSz in XPath('./w:pgSz')(sectPr):
+                w, h = get(pgSz, 'w:w'), get(pgSz, 'w:h')
+                try:
+                    self.width = int(w)/20
+                except (ValueError, TypeError):
+                    pass
+                try:
+                    self.height = int(h)/20
+                except (ValueError, TypeError):
+                    pass
+            for pgMar in XPath('./w:pgMar')(sectPr):
+                l, r = get(pgMar, 'w:left'), get(pgMar, 'w:right')
+                try:
+                    self.margin_left = int(l)/20
+                except (ValueError, TypeError):
+                    pass
+                try:
+                    self.margin_right = int(r)/20
+                except (ValueError, TypeError):
+                    pass
+
 class Style(object):
 
     '''
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index 902952ca4a..cda4c97f7e 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -15,9 +15,10 @@ from lxml.html.builder import (
 
 from calibre.ebooks.docx.container import DOCX, fromstring
 from calibre.ebooks.docx.names import XPath, is_tag, XML, STYLES, NUMBERING, FONTS
-from calibre.ebooks.docx.styles import Styles, inherit
+from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
 from calibre.ebooks.docx.numbering import Numbering
 from calibre.ebooks.docx.fonts import Fonts
+from calibre.ebooks.docx.images import Images
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
 
 class Text:
@@ -38,6 +39,7 @@ class Convert(object):
         self.mi = self.docx.metadata
         self.body = BODY()
         self.styles = Styles()
+        self.images = Images()
         self.object_map = OrderedDict()
         self.html = HTML(
             HEAD(
@@ -64,8 +66,12 @@ class Convert(object):
         doc = self.docx.document
         relationships_by_id, relationships_by_type = self.docx.document_relationships
         self.read_styles(relationships_by_type)
+        self.images(relationships_by_id)
         self.layers = OrderedDict()
-        for wp in XPath('//w:p')(doc):
+
+        self.read_page_properties(doc)
+        for wp, page_properties in self.page_map.iteritems():
+            self.current_page = page_properties
             p = self.convert_p(wp)
             self.body.append(p)
             # TODO: tables child of (nested tables?)
@@ -102,6 +108,34 @@ class Convert(object):
                 html_obj.set('class', cls)
         self.write()
 
+    def read_page_properties(self, doc):
+        '''
+        Build self.page_map, an ordered mapping of every w:p element in doc to
+        the PageProperties of its section. A w:sectPr inside a paragraph
+        applies to that paragraph and to all preceding paragraphs not yet
+        assigned to a section; the remaining paragraphs use the w:sectPr that
+        is a direct child of w:body (or the defaults if there is none).
+        '''
+        current = []
+        self.page_map = OrderedDict()
+        # Compile the expression once instead of once per paragraph
+        find_sect = XPath('descendant::w:sectPr')
+
+        for p in XPath('//w:p')(doc):
+            sect = find_sect(p)
+            if sect:
+                pr = PageProperties(sect)
+                for x in current + [p]:
+                    self.page_map[x] = pr
+                current = []
+            else:
+                current.append(p)
+        if current:
+            last = XPath('./w:body/w:sectPr')(doc)
+            pr = PageProperties(last)
+            for x in current:
+                self.page_map[x] = pr
+
     def read_styles(self, relationships_by_type):
 
         def get_name(rtype, defname):
@@ -239,6 +273,10 @@ class Convert(object):
                         br = BR()
                 text.add_elem(br)
                 ans.append(text.elem)
+            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
+                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
+                    text.add_elem(img)
+                    ans.append(text.elem)
         if text.buf:
             setattr(text.elem, text.attr, ''.join(text.buf))
 
@@ -253,3 +291,4 @@ if __name__ == '__main__':
     from calibre.utils.logging import default_log
     default_log.filter_level = default_log.DEBUG
     Convert(sys.argv[-1], log=default_log)()
+