From 22f95c8678b130b54a9898fad009f0a103012afc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 4 May 2013 12:32:06 +0530 Subject: [PATCH] Refactor DOCX metadata reading to use the container class --- src/calibre/ebooks/docx/container.py | 123 ++++++++++++++++++++++++--- src/calibre/ebooks/docx/names.py | 7 ++ src/calibre/ebooks/docx/to_html.py | 41 +++++++++ src/calibre/ebooks/metadata/docx.py | 73 ++-------------- 4 files changed, 168 insertions(+), 76 deletions(-) create mode 100644 src/calibre/ebooks/docx/to_html.py diff --git a/src/calibre/ebooks/docx/container.py b/src/calibre/ebooks/docx/container.py index efbe7b8fcb..cae22e086c 100644 --- a/src/calibre/ebooks/docx/container.py +++ b/src/calibre/ebooks/docx/container.py @@ -6,30 +6,90 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -import os, sys +import os, sys, shutil from lxml import etree from calibre import walk, guess_type +from calibre.ebooks.metadata import string_to_authors +from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.docx import InvalidDOCX -from calibre.ebooks.docx.names import DOCUMENT +from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.localization import canonicalize_lang from calibre.utils.logging import default_log from calibre.utils.zipfile import ZipFile +from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER + +def fromstring(raw, parser=RECOVER_PARSER): + return etree.fromstring(raw, parser=parser) + +# Read metadata {{{ +def read_doc_props(raw, mi): + root = fromstring(raw) + titles = XPath('//dc:title')(root) + if titles: + title = titles[0].text + if title and title.strip(): + mi.title = title.strip() + tags = [] + for subject in XPath('//dc:subject')(root): + if subject.text and subject.text.strip(): + tags.append(subject.text.strip().replace(',', '_')) + for keywords in XPath('//cp:keywords')(root): + if keywords.text and keywords.text.strip(): + for x in keywords.text.split(): + tags.extend(y.strip() for y in x.split(',')) + if tags: + mi.tags = tags + authors = XPath('//dc:creator')(root) + aut = [] + for author in authors: + if author.text and author.text.strip(): + aut.extend(string_to_authors(author.text)) + if aut: + mi.authors = aut + + desc = XPath('//dc:description')(root) + if desc: + raw = etree.tostring(desc[0], method='text', encoding=unicode) + mi.comments = raw + + langs = [] + for lang in XPath('//dc:language')(root): + if lang.text and lang.text.strip(): + l = canonicalize_lang(lang.text) + if l: + langs.append(l) + if langs: + mi.languages = langs + +def read_app_props(raw, mi): + root = fromstring(raw) + company = root.xpath('//*[local-name()="Company"]') + if company and company[0].text and company[0].text.strip(): + mi.publisher = company[0].text.strip() +# }}} class DOCX(object): - def __init__(self, path_or_stream, log=None): + def __init__(self, path_or_stream, log=None, extract=True): stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb') self.name = getattr(stream, 'name', None) or '' self.log = log or default_log - self.tdir = PersistentTemporaryDirectory('docx_container') - - self.extract(stream) + if extract: + self.extract(stream) + else: + self.init_zipfile(stream) self.read_content_types() self.read_package_relationships() + def init_zipfile(self, stream): + self.zipf = ZipFile(stream) + self.names = frozenset(self.zipf.namelist()) + def extract(self, stream): + self.tdir = PersistentTemporaryDirectory('docx_container') try: zf = ZipFile(stream) zf.extractall(self.tdir) @@ -46,6 +106,8 @@ class DOCX(object): self.names[name] = f def read(self, name): + if hasattr(self, 'zipf'): + return self.zipf.open(name).read() path = self.names[name] with open(path, 'rb') as f: return f.read() @@ -55,7 +117,7 @@ class DOCX(object): raw = self.read('[Content_Types].xml') except KeyError: raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name) - root = etree.fromstring(raw) + root = fromstring(raw) self.content_types = {} self.default_content_types = {} for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'): @@ -77,7 +139,7 @@ class DOCX(object): raw = self.read('_rels/.rels') except KeyError: raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name) - root = etree.fromstring(raw) + root = fromstring(raw) self.relationships = {} self.relationships_rmap = {} for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): @@ -94,7 +156,48 @@ class DOCX(object): if not names: raise InvalidDOCX('The file %s docx file has no main document' % self.name) name = names[0] - return etree.fromstring(self.read(name)) + return fromstring(self.read(name)) + + @property + def metadata(self): + mi = Metadata(_('Unknown')) + name = self.relationships.get(DOCPROPS, None) + if name is None: + names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml') + if names: + name = names[0] + if name: + try: + raw = self.read(name) + except KeyError: + pass + else: + read_doc_props(raw, mi) + + name = self.relationships.get(APPPROPS, None) + if name is None: + names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml') + if names: + name = names[0] + if name: + try: + raw = self.read(name) + except KeyError: + pass + else: + read_app_props(raw, mi) + + return mi + + def close(self): + if hasattr(self, 'zipf'): + self.zipf.close() + else: + try: + shutil.rmtree(self.tdir) + except EnvironmentError: + pass if __name__ == '__main__': - d = DOCX(sys.argv[-1]) + d = DOCX(sys.argv[-1], extract=False) + print (d.metadata) diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py index 0a31d08ab7..9080377b36 100644 --- a/src/calibre/ebooks/docx/names.py +++ b/src/calibre/ebooks/docx/names.py @@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' +from lxml.etree import XPath as X + DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' +DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties' +APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties' namespaces = { 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', @@ -38,3 +42,6 @@ namespaces = { 'dcterms': 'http://purl.org/dc/terms/' } +def XPath(expr): + return X(expr, namespaces=namespaces) + diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py new file mode 100644 index 0000000000..b2a5de4691 --- /dev/null +++ b/src/calibre/ebooks/docx/to_html.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +import sys, os + +from lxml import html +from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META) + +from calibre.ebooks.docx.container import Container + +class Convert(object): + + def __init__(self, path_or_stream, dest_dir=None, log=None): + self.container = Container(path_or_stream, log=log) + self.log = self.container.log + self.dest_dir = dest_dir or os.getcwdu() + self.body = BODY() + self.html = HTML( + HEAD( + META(charset='utf-8'), + TITLE('TODO: read from metadata'), + LINK(rel='stylesheet', type='text/css', href='docx.css'), + ), + self.body + ) + + def __call__(self): + self.write() + + def write(self): + raw = html.tostring(self.html, encoding='utf-8', doctype='') + with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: + f.write(raw) + +if __name__ == '__main__': + Convert(sys.argv[-1])() diff --git a/src/calibre/ebooks/metadata/docx.py b/src/calibre/ebooks/metadata/docx.py index cb265424cc..31b0c48974 100644 --- a/src/calibre/ebooks/metadata/docx.py +++ b/src/calibre/ebooks/metadata/docx.py @@ -7,80 +7,21 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from lxml import etree +from calibre.ebooks.docx.container import DOCX -from calibre.ebooks.metadata.book.base import Metadata -from calibre.utils.localization import canonicalize_lang from calibre.utils.zipfile import ZipFile from calibre.utils.magick.draw import identify_data -from calibre.ebooks.oeb.base import DC11_NS -from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER - -NSMAP = {'dc':DC11_NS, -'cp':'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'} - -def XPath(expr): - return etree.XPath(expr, namespaces=NSMAP) - -def _read_doc_props(raw, mi): - from calibre.ebooks.metadata import string_to_authors - root = etree.fromstring(raw, parser=RECOVER_PARSER) - titles = XPath('//dc:title')(root) - if titles: - title = titles[0].text - if title and title.strip(): - mi.title = title.strip() - tags = [] - for subject in XPath('//dc:subject')(root): - if subject.text and subject.text.strip(): - tags.append(subject.text.strip().replace(',', '_')) - for keywords in XPath('//cp:keywords')(root): - if keywords.text and keywords.text.strip(): - for x in keywords.text.split(): - tags.extend(y.strip() for y in x.split(',')) - if tags: - mi.tags = tags - authors = XPath('//dc:creator')(root) - aut = [] - for author in authors: - if author.text and author.text.strip(): - aut.extend(string_to_authors(author.text)) - if aut: - mi.authors = aut - - desc = XPath('//dc:description')(root) - if desc: - raw = etree.tostring(desc[0], method='text', encoding=unicode) - mi.comments = raw - - langs = [] - for lang in XPath('//dc:language')(root): - if lang.text and lang.text.strip(): - l = canonicalize_lang(lang.text) - if l: - langs.append(l) - if langs: - mi.languages = langs - -def _read_app_props(raw, mi): - root = etree.fromstring(raw, parser=RECOVER_PARSER) - company = root.xpath('//*[local-name()="Company"]') - if company and company[0].text and company[0].text.strip(): - mi.publisher = company[0].text.strip() def get_metadata(stream): + c = DOCX(stream, extract=False) + mi = c.metadata + c.close() + stream.seek(0) + cdata = None with ZipFile(stream, 'r') as zf: - - mi = Metadata(_('Unknown')) - cdata = None - for zi in zf.infolist(): ext = zi.filename.rpartition('.')[-1].lower() - if zi.filename.lower() == 'docprops/core.xml': - _read_doc_props(zf.read(zi), mi) - elif zi.filename.lower() == 'docprops/app.xml': - _read_app_props(zf.read(zi), mi) - elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}: + if cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}: raw = zf.read(zi) try: width, height, fmt = identify_data(raw)