# NOTE: In the original change set InvalidDOCX is defined in
# calibre/ebooks/docx/__init__.py and DOCX in calibre/ebooks/docx/container.py.
# DOCX relies on these module-level names imported by container.py:
#   os, sys, etree (lxml), walk and guess_type (calibre),
#   InvalidDOCX (calibre.ebooks.docx), DOCUMENT (calibre.ebooks.docx.names),
#   PersistentTemporaryDirectory (calibre.ptempfile),
#   default_log (calibre.utils.logging), ZipFile (calibre.utils.zipfile).


class InvalidDOCX(ValueError):
    """Raised when a DOCX package is missing a part this reader requires."""
    pass


class DOCX(object):
    """Container for a DOCX package unpacked into a temporary directory.

    On construction the archive is extracted, every extracted file is
    indexed by its archive-relative name, and the content types and package
    relationships are parsed.

    Attributes set during construction:
        name: ``name`` attribute of the input stream, or ``''`` if absent.
        log: logger used to report problems.
        tdir: directory the archive was extracted into.
        names: maps '/'-separated archive names to extracted file paths.
        content_types: maps part names to explicitly overridden types.
        default_content_types: maps lower-cased extensions to types.
        relationships: maps relationship type to target part name.
        relationships_rmap: maps target part name to relationship type.
    """

    def __init__(self, path_or_stream, log=None):
        """Open and index a DOCX file.

        :param path_or_stream: a filesystem path, or an object with a
            ``read`` attribute, which is treated as an open binary stream.
        :param log: logger to use; falls back to ``default_log``.
        :raises InvalidDOCX: if ``[Content_Types].xml`` or ``_rels/.rels``
            is missing from the package.
        """
        # Only close the stream if it was opened here; a caller-supplied
        # stream remains the caller's responsibility.
        opened_here = not hasattr(path_or_stream, 'read')
        stream = open(path_or_stream, 'rb') if opened_here else path_or_stream
        try:
            self.name = getattr(stream, 'name', None) or ''
            self.log = log or default_log
            self.tdir = PersistentTemporaryDirectory('docx_container')
            self.extract(stream)
        finally:
            # Everything needed is on disk after extraction, so the handle
            # can be released even if extraction failed.
            if opened_here:
                stream.close()

        self.read_content_types()
        self.read_package_relationships()

    def extract(self, stream):
        """Unpack ``stream`` into ``self.tdir`` and build ``self.names``.

        If the regular ZIP reader fails, the failure is logged and a more
        forgiving extractor is tried from the start of the stream.
        """
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit propagate instead of triggering the fallback.
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        # Index every extracted file by its '/'-separated path relative to
        # the extraction directory.
        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def read(self, name):
        """Return the raw bytes of the part called ``name``.

        :raises KeyError: if ``name`` is not in ``self.names``.
        """
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        """Parse ``[Content_Types].xml`` into the two content type maps.

        :raises InvalidDOCX: if the package has no ``[Content_Types].xml``.
        """
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = etree.fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        # Elements are matched by local name, so any namespace prefix works.
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            # Leading slashes are stripped so keys line up with self.names.
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        """Return the content type of the part called ``name``.

        An explicit override wins, then the default registered for the
        lower-cased text after the last ``'.'``, then ``guess_type``.
        """
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        """Parse ``_rels/.rels`` into forward and reverse relationship maps.

        If several relationships share a type (or a target), the last one
        in document order wins.

        :raises InvalidDOCX: if the package has no ``_rels/.rels``.
        """
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = etree.fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document(self):
        """Parsed XML root of the main document part.

        The part is located through the DOCUMENT relationship. If that
        relationship is absent, any part named ``document.xml`` (at the top
        level or in a sub-directory) is used instead.

        :raises InvalidDOCX: if no main document part can be found.
        """
        name = self.relationships.get(DOCUMENT, None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return etree.fromstring(self.read(name))


if __name__ == '__main__':
    d = DOCX(sys.argv[-1])
# Relationship type identifying the main (office) document part of a package.
DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'

# Conventional prefix -> namespace URI table for DOCX-related XML.
# Written as an ordered sequence of pairs, grouped by area.
namespaces = dict((
    ('mo', 'http://schemas.microsoft.com/office/mac/office/2008/main'),
    ('o', 'urn:schemas-microsoft-com:office:office'),
    ('ve', 'http://schemas.openxmlformats.org/markup-compatibility/2006'),

    # Text Content
    ('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'),
    ('w10', 'urn:schemas-microsoft-com:office:word'),
    ('wne', 'http://schemas.microsoft.com/office/word/2006/wordml'),

    # Drawing
    ('a', 'http://schemas.openxmlformats.org/drawingml/2006/main'),
    ('m', 'http://schemas.openxmlformats.org/officeDocument/2006/math'),
    ('mv', 'urn:schemas-microsoft-com:mac:vml'),
    ('pic', 'http://schemas.openxmlformats.org/drawingml/2006/picture'),
    ('v', 'urn:schemas-microsoft-com:vml'),
    ('wp', 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'),

    # Properties (core and extended)
    ('cp', 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'),
    ('dc', 'http://purl.org/dc/elements/1.1/'),
    ('ep', 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties'),
    ('xsi', 'http://www.w3.org/2001/XMLSchema-instance'),

    # Content Types
    ('ct', 'http://schemas.openxmlformats.org/package/2006/content-types'),

    # Package Relationships
    ('r', 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'),
    ('pr', 'http://schemas.openxmlformats.org/package/2006/relationships'),

    # Dublin Core document properties
    ('dcmitype', 'http://purl.org/dc/dcmitype/'),
    ('dcterms', 'http://purl.org/dc/terms/'),
))