Refactor DOCX metadata reading to use the container class

Kovid Goyal 2013-05-04 12:32:06 +05:30
parent df6f0f8dc7
commit 22f95c8678
4 changed files with 168 additions and 76 deletions
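
The refactor makes metadata reading a responsibility of the DOCX container itself: callers construct the container (optionally without unpacking it to a temporary directory) and read a Metadata object from its new metadata property, instead of re-parsing docProps/*.xml by hand. A minimal usage sketch based on the diffs below; the input path is a hypothetical example:

from calibre.ebooks.docx.container import DOCX

d = DOCX('sample.docx', extract=False)   # extract=False reads straight from the zip archive
mi = d.metadata                          # parses docProps/core.xml and docProps/app.xml
print(mi)
d.close()                                # closes the ZipFile (or removes the temp dir when extract=True)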

View File

@@ -6,30 +6,90 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import os, sys
+import os, sys, shutil
 
 from lxml import etree
 
 from calibre import walk, guess_type
+from calibre.ebooks.metadata import string_to_authors
+from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.docx import InvalidDOCX
-from calibre.ebooks.docx.names import DOCUMENT
+from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS
 from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.localization import canonicalize_lang
 from calibre.utils.logging import default_log
 from calibre.utils.zipfile import ZipFile
+from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
+
+def fromstring(raw, parser=RECOVER_PARSER):
+    return etree.fromstring(raw, parser=parser)
+
+# Read metadata {{{
+def read_doc_props(raw, mi):
+    root = fromstring(raw)
+    titles = XPath('//dc:title')(root)
+    if titles:
+        title = titles[0].text
+        if title and title.strip():
+            mi.title = title.strip()
+    tags = []
+    for subject in XPath('//dc:subject')(root):
+        if subject.text and subject.text.strip():
+            tags.append(subject.text.strip().replace(',', '_'))
+    for keywords in XPath('//cp:keywords')(root):
+        if keywords.text and keywords.text.strip():
+            for x in keywords.text.split():
+                tags.extend(y.strip() for y in x.split(','))
+    if tags:
+        mi.tags = tags
+    authors = XPath('//dc:creator')(root)
+    aut = []
+    for author in authors:
+        if author.text and author.text.strip():
+            aut.extend(string_to_authors(author.text))
+    if aut:
+        mi.authors = aut
+
+    desc = XPath('//dc:description')(root)
+    if desc:
+        raw = etree.tostring(desc[0], method='text', encoding=unicode)
+        mi.comments = raw
+
+    langs = []
+    for lang in XPath('//dc:language')(root):
+        if lang.text and lang.text.strip():
+            l = canonicalize_lang(lang.text)
+            if l:
+                langs.append(l)
+    if langs:
+        mi.languages = langs
+
+def read_app_props(raw, mi):
+    root = fromstring(raw)
+    company = root.xpath('//*[local-name()="Company"]')
+    if company and company[0].text and company[0].text.strip():
+        mi.publisher = company[0].text.strip()
+# }}}
+
 class DOCX(object):
 
-    def __init__(self, path_or_stream, log=None):
+    def __init__(self, path_or_stream, log=None, extract=True):
         stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
         self.name = getattr(stream, 'name', None) or '<stream>'
         self.log = log or default_log
-        self.tdir = PersistentTemporaryDirectory('docx_container')
-        self.extract(stream)
+        if extract:
+            self.extract(stream)
+        else:
+            self.init_zipfile(stream)
         self.read_content_types()
         self.read_package_relationships()
 
+    def init_zipfile(self, stream):
+        self.zipf = ZipFile(stream)
+        self.names = frozenset(self.zipf.namelist())
+
     def extract(self, stream):
+        self.tdir = PersistentTemporaryDirectory('docx_container')
         try:
             zf = ZipFile(stream)
             zf.extractall(self.tdir)

@@ -46,6 +106,8 @@ class DOCX(object):
             self.names[name] = f
 
     def read(self, name):
+        if hasattr(self, 'zipf'):
+            return self.zipf.open(name).read()
         path = self.names[name]
         with open(path, 'rb') as f:
             return f.read()

@@ -55,7 +117,7 @@ class DOCX(object):
             raw = self.read('[Content_Types].xml')
         except KeyError:
             raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
-        root = etree.fromstring(raw)
+        root = fromstring(raw)
         self.content_types = {}
         self.default_content_types = {}
         for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):

@@ -77,7 +139,7 @@ class DOCX(object):
             raw = self.read('_rels/.rels')
         except KeyError:
             raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
-        root = etree.fromstring(raw)
+        root = fromstring(raw)
         self.relationships = {}
         self.relationships_rmap = {}
         for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):

@@ -94,7 +156,48 @@ class DOCX(object):
         if not names:
             raise InvalidDOCX('The file %s docx file has no main document' % self.name)
         name = names[0]
-        return etree.fromstring(self.read(name))
+        return fromstring(self.read(name))
+
+    @property
+    def metadata(self):
+        mi = Metadata(_('Unknown'))
+        name = self.relationships.get(DOCPROPS, None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
+            if names:
+                name = names[0]
+        if name:
+            try:
+                raw = self.read(name)
+            except KeyError:
+                pass
+            else:
+                read_doc_props(raw, mi)
+
+        name = self.relationships.get(APPPROPS, None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
+            if names:
+                name = names[0]
+        if name:
+            try:
+                raw = self.read(name)
+            except KeyError:
+                pass
+            else:
+                read_app_props(raw, mi)
+
+        return mi
+
+    def close(self):
+        if hasattr(self, 'zipf'):
+            self.zipf.close()
+        else:
+            try:
+                shutil.rmtree(self.tdir)
+            except EnvironmentError:
+                pass
 
 if __name__ == '__main__':
-    d = DOCX(sys.argv[-1])
+    d = DOCX(sys.argv[-1], extract=False)
+    print (d.metadata)
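
As a side note, the read() and close() methods above dispatch on whether the container was built by extracting to a temporary directory or by keeping an open ZipFile. A small sketch of the two modes (not part of the commit; the input path is hypothetical):

from calibre.ebooks.docx.container import DOCX

# Metadata-only access: keep the archive zipped and read entries on demand.
d = DOCX('sample.docx', extract=False)   # hypothetical path
raw = d.read('[Content_Types].xml')      # served via ZipFile.open(name).read()
d.close()                                # closes the underlying ZipFile

# Conversion access: unpack everything into a temporary directory first.
d = DOCX('sample.docx')                  # extract=True is the default
raw = d.read('[Content_Types].xml')      # read from the extracted file on disk
d.close()                                # removes the temporary directory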

View File

@@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
+from lxml.etree import XPath as X
+
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
+DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
+APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
 
 namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

@@ -38,3 +42,6 @@ namespaces = {
     'dcterms': 'http://purl.org/dc/terms/'
 }
+
+def XPath(expr):
+    return X(expr, namespaces=namespaces)
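
For reference, a small sketch (not part of the commit) of how the namespace-aware XPath helper added here is used by the container code above. The XML snippet is an illustrative stand-in for docProps/core.xml, and it assumes the shared namespaces map binds the dc prefix to the standard Dublin Core URI, which the core-properties parsing above relies on:

from lxml import etree
from calibre.ebooks.docx.names import XPath

# Illustrative core-properties fragment (hypothetical content).
raw = b'''<cp:coreProperties
  xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
  xmlns:dc="http://purl.org/dc/elements/1.1/">
  <dc:title>An example title</dc:title>
</cp:coreProperties>'''

root = etree.fromstring(raw)
# The compiled expression resolves the dc prefix via the shared namespaces map.
print(XPath('//dc:title')(root)[0].text)  # -> An example title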

View File

@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys, os
+
+from lxml import html
+from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
+
+from calibre.ebooks.docx.container import Container
+
+class Convert(object):
+
+    def __init__(self, path_or_stream, dest_dir=None, log=None):
+        self.container = Container(path_or_stream, log=log)
+        self.log = self.container.log
+        self.dest_dir = dest_dir or os.getcwdu()
+        self.body = BODY()
+        self.html = HTML(
+            HEAD(
+                META(charset='utf-8'),
+                TITLE('TODO: read from metadata'),
+                LINK(rel='stylesheet', type='text/css', href='docx.css'),
+            ),
+            self.body
+        )
+
+    def __call__(self):
+        self.write()
+
+    def write(self):
+        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
+        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
+            f.write(raw)
+
+if __name__ == '__main__':
+    Convert(sys.argv[-1])()

View File

@@ -7,80 +7,21 @@ __license__ = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from lxml import etree
-
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.utils.localization import canonicalize_lang
+from calibre.ebooks.docx.container import DOCX
 from calibre.utils.zipfile import ZipFile
 from calibre.utils.magick.draw import identify_data
-from calibre.ebooks.oeb.base import DC11_NS
-from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
-
-NSMAP = {'dc':DC11_NS,
-         'cp':'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'}
-
-def XPath(expr):
-    return etree.XPath(expr, namespaces=NSMAP)
-
-def _read_doc_props(raw, mi):
-    from calibre.ebooks.metadata import string_to_authors
-    root = etree.fromstring(raw, parser=RECOVER_PARSER)
-    titles = XPath('//dc:title')(root)
-    if titles:
-        title = titles[0].text
-        if title and title.strip():
-            mi.title = title.strip()
-    tags = []
-    for subject in XPath('//dc:subject')(root):
-        if subject.text and subject.text.strip():
-            tags.append(subject.text.strip().replace(',', '_'))
-    for keywords in XPath('//cp:keywords')(root):
-        if keywords.text and keywords.text.strip():
-            for x in keywords.text.split():
-                tags.extend(y.strip() for y in x.split(','))
-    if tags:
-        mi.tags = tags
-    authors = XPath('//dc:creator')(root)
-    aut = []
-    for author in authors:
-        if author.text and author.text.strip():
-            aut.extend(string_to_authors(author.text))
-    if aut:
-        mi.authors = aut
-
-    desc = XPath('//dc:description')(root)
-    if desc:
-        raw = etree.tostring(desc[0], method='text', encoding=unicode)
-        mi.comments = raw
-
-    langs = []
-    for lang in XPath('//dc:language')(root):
-        if lang.text and lang.text.strip():
-            l = canonicalize_lang(lang.text)
-            if l:
-                langs.append(l)
-    if langs:
-        mi.languages = langs
-
-def _read_app_props(raw, mi):
-    root = etree.fromstring(raw, parser=RECOVER_PARSER)
-    company = root.xpath('//*[local-name()="Company"]')
-    if company and company[0].text and company[0].text.strip():
-        mi.publisher = company[0].text.strip()
-
 def get_metadata(stream):
+    c = DOCX(stream, extract=False)
+    mi = c.metadata
+    c.close()
+    stream.seek(0)
+    cdata = None
     with ZipFile(stream, 'r') as zf:
-        mi = Metadata(_('Unknown'))
-        cdata = None
         for zi in zf.infolist():
             ext = zi.filename.rpartition('.')[-1].lower()
-            if zi.filename.lower() == 'docprops/core.xml':
-                _read_doc_props(zf.read(zi), mi)
-            elif zi.filename.lower() == 'docprops/app.xml':
-                _read_app_props(zf.read(zi), mi)
-            elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
+            if cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                 raw = zf.read(zi)
                 try:
                     width, height, fmt = identify_data(raw)