Start work on docx input plugin

2025-08-11 09:13:57 -04:00 · 2013-05-04 07:46:41 +05:30 · 2013-05-04 07:46:41 +05:30 · 8f08d9446d
commit 8f08d9446d
parent 42989d47a3
3 changed files with 151 additions and 0 deletions
--- a/src/calibre/ebooks/docx/init.py
+++ b/src/calibre/ebooks/docx/init.py
@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+class InvalidDOCX(ValueError):
+    pass
+
--- a/src/calibre/ebooks/docx/container.py
+++ b/src/calibre/ebooks/docx/container.py
@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os, sys
+
+from lxml import etree
+
+from calibre import walk, guess_type
+from calibre.ebooks.docx import InvalidDOCX
+from calibre.ebooks.docx.names import DOCUMENT
+from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.logging import default_log
+from calibre.utils.zipfile import ZipFile
+
+class DOCX(object):
+
+    def __init__(self, path_or_stream, log=None):
+        stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
+        self.name = getattr(stream, 'name', None) or '<stream>'
+        self.log = log or default_log
+        self.tdir = PersistentTemporaryDirectory('docx_container')
+
+        self.extract(stream)
+        self.read_content_types()
+        self.read_package_relationships()
+
+    def extract(self, stream):
+        try:
+            zf = ZipFile(stream)
+            zf.extractall(self.tdir)
+        except:
+            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
+                    ' more forgiving ZIP parser')
+            from calibre.utils.localunzip import extractall
+            stream.seek(0)
+            extractall(stream, self.tdir)
+
+        self.names = {}
+        for f in walk(self.tdir):
+            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
+            self.names[name] = f
+
+    def read(self, name):
+        path = self.names[name]
+        with open(path, 'rb') as f:
+            return f.read()
+
+    def read_content_types(self):
+        try:
+            raw = self.read('[Content_Types].xml')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
+        root = etree.fromstring(raw)
+        self.content_types = {}
+        self.default_content_types = {}
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
+            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
+        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
+            name = item.get('PartName').lstrip('/')
+            self.content_types[name] = item.get('ContentType')
+
+    def content_type(self, name):
+        if name in self.content_types:
+            return self.content_types[name]
+        ext = name.rpartition('.')[-1].lower()
+        if ext in self.default_content_types:
+            return self.default_content_types[ext]
+        return guess_type(name)[0]
+
+    def read_package_relationships(self):
+        try:
+            raw = self.read('_rels/.rels')
+        except KeyError:
+            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
+        root = etree.fromstring(raw)
+        self.relationships = {}
+        self.relationships_rmap = {}
+        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
+            target = item.get('Target').lstrip('/')
+            typ = item.get('Type')
+            self.relationships[typ] = target
+            self.relationships_rmap[target] = typ
+
+    @property
+    def document(self):
+        name = self.relationships.get(DOCUMENT, None)
+        if name is None:
+            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
+            if not names:
+                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
+            name = names[0]
+        return etree.fromstring(self.read(name))
+
+if __name__ == '__main__':
+    d = DOCX(sys.argv[-1])
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
+
+namespaces = {
+    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
+    'o': 'urn:schemas-microsoft-com:office:office',
+    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
+    # Text Content
+    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
+    'w10': 'urn:schemas-microsoft-com:office:word',
+    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+    # Drawing
+    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
+    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
+    'mv': 'urn:schemas-microsoft-com:mac:vml',
+    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
+    'v': 'urn:schemas-microsoft-com:vml',
+    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+    # Properties (core and extended)
+    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
+    'dc': 'http://purl.org/dc/elements/1.1/',
+    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
+    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+    # Content Types
+    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
+    # Package Relationships
+    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
+    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
+    # Dublin Core document properties
+    'dcmitype': 'http://purl.org/dc/dcmitype/',
+    'dcterms': 'http://purl.org/dc/terms/'
+}
+