mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)

Refactor DOCX metadata reading to use the container class

This commit is contained in:
parent df6f0f8dc7
commit 22f95c8678
src/calibre/ebooks/docx/container.py

@@ -6,30 +6,90 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import os, sys
+import os, sys, shutil
 
 from lxml import etree
 
 from calibre import walk, guess_type
+from calibre.ebooks.metadata import string_to_authors
+from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.docx import InvalidDOCX
-from calibre.ebooks.docx.names import DOCUMENT
+from calibre.ebooks.docx.names import DOCUMENT, DOCPROPS, XPath, APPPROPS
 from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.localization import canonicalize_lang
 from calibre.utils.logging import default_log
 from calibre.utils.zipfile import ZipFile
+from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
+
+def fromstring(raw, parser=RECOVER_PARSER):
+    return etree.fromstring(raw, parser=parser)
+
+# Read metadata {{{
+def read_doc_props(raw, mi):
+    root = fromstring(raw)
+    titles = XPath('//dc:title')(root)
+    if titles:
+        title = titles[0].text
+        if title and title.strip():
+            mi.title = title.strip()
+    tags = []
+    for subject in XPath('//dc:subject')(root):
+        if subject.text and subject.text.strip():
+            tags.append(subject.text.strip().replace(',', '_'))
+    for keywords in XPath('//cp:keywords')(root):
+        if keywords.text and keywords.text.strip():
+            for x in keywords.text.split():
+                tags.extend(y.strip() for y in x.split(','))
+    if tags:
+        mi.tags = tags
+    authors = XPath('//dc:creator')(root)
+    aut = []
+    for author in authors:
+        if author.text and author.text.strip():
+            aut.extend(string_to_authors(author.text))
+    if aut:
+        mi.authors = aut
+
+    desc = XPath('//dc:description')(root)
+    if desc:
+        raw = etree.tostring(desc[0], method='text', encoding=unicode)
+        mi.comments = raw
+
+    langs = []
+    for lang in XPath('//dc:language')(root):
+        if lang.text and lang.text.strip():
+            l = canonicalize_lang(lang.text)
+            if l:
+                langs.append(l)
+    if langs:
+        mi.languages = langs
+
+def read_app_props(raw, mi):
+    root = fromstring(raw)
+    company = root.xpath('//*[local-name()="Company"]')
+    if company and company[0].text and company[0].text.strip():
+        mi.publisher = company[0].text.strip()
+# }}}
+
 class DOCX(object):
 
-    def __init__(self, path_or_stream, log=None):
+    def __init__(self, path_or_stream, log=None, extract=True):
         stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
         self.name = getattr(stream, 'name', None) or '<stream>'
         self.log = log or default_log
-        self.tdir = PersistentTemporaryDirectory('docx_container')
-
-        self.extract(stream)
+        if extract:
+            self.extract(stream)
+        else:
+            self.init_zipfile(stream)
         self.read_content_types()
         self.read_package_relationships()
 
+    def init_zipfile(self, stream):
+        self.zipf = ZipFile(stream)
+        self.names = frozenset(self.zipf.namelist())
+
     def extract(self, stream):
+        self.tdir = PersistentTemporaryDirectory('docx_container')
         try:
             zf = ZipFile(stream)
             zf.extractall(self.tdir)
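For orientation, here is a minimal sketch of how the new module-level helpers could be exercised on their own. The function names and signatures come from the hunk above; the file path is a made-up placeholder and the snippet assumes a working calibre environment where these modules are importable.

# Illustrative only: feed the contents of docProps/core.xml to the new helper
# and inspect the Metadata object it fills in.
from calibre.ebooks.docx.container import read_doc_props
from calibre.ebooks.metadata.book.base import Metadata

with open('/tmp/core.xml', 'rb') as f:  # hypothetical, pre-extracted docProps/core.xml
    raw = f.read()
mi = Metadata('Unknown')
read_doc_props(raw, mi)  # populates title, authors, tags, comments and languages
print(mi.title, mi.authors)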
@@ -46,6 +106,8 @@ class DOCX(object):
             self.names[name] = f
 
     def read(self, name):
+        if hasattr(self, 'zipf'):
+            return self.zipf.open(name).read()
         path = self.names[name]
         with open(path, 'rb') as f:
             return f.read()
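The read() method now serves data from whichever backend the container was built with. A small sketch of the intent, assuming the DOCX class from this diff is importable and the sample path exists (both are placeholders):

from calibre.ebooks.docx.container import DOCX

# With extract=True, read() opens the file unpacked into the temp directory;
# with extract=False, it pulls the member straight out of the zip archive.
d1 = DOCX('/tmp/sample.docx')                 # hypothetical path; extracts to a temp dir
d2 = DOCX('/tmp/sample.docx', extract=False)  # stays inside the zip
assert d1.read('word/document.xml') == d2.read('word/document.xml')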
@@ -55,7 +117,7 @@ class DOCX(object):
             raw = self.read('[Content_Types].xml')
         except KeyError:
             raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
-        root = etree.fromstring(raw)
+        root = fromstring(raw)
         self.content_types = {}
         self.default_content_types = {}
         for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
@@ -77,7 +139,7 @@ class DOCX(object):
             raw = self.read('_rels/.rels')
         except KeyError:
             raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
-        root = etree.fromstring(raw)
+        root = fromstring(raw)
         self.relationships = {}
         self.relationships_rmap = {}
         for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
@@ -94,7 +156,48 @@ class DOCX(object):
         if not names:
             raise InvalidDOCX('The file %s docx file has no main document' % self.name)
         name = names[0]
-        return etree.fromstring(self.read(name))
+        return fromstring(self.read(name))
+
+    @property
+    def metadata(self):
+        mi = Metadata(_('Unknown'))
+        name = self.relationships.get(DOCPROPS, None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
+            if names:
+                name = names[0]
+        if name:
+            try:
+                raw = self.read(name)
+            except KeyError:
+                pass
+            else:
+                read_doc_props(raw, mi)
+
+        name = self.relationships.get(APPPROPS, None)
+        if name is None:
+            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
+            if names:
+                name = names[0]
+        if name:
+            try:
+                raw = self.read(name)
+            except KeyError:
+                pass
+            else:
+                read_app_props(raw, mi)
+
+        return mi
+
+    def close(self):
+        if hasattr(self, 'zipf'):
+            self.zipf.close()
+        else:
+            try:
+                shutil.rmtree(self.tdir)
+            except EnvironmentError:
+                pass
 
 if __name__ == '__main__':
-    d = DOCX(sys.argv[-1])
+    d = DOCX(sys.argv[-1], extract=False)
+    print (d.metadata)
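Taken together, the refactored container can now report metadata without unpacking the archive at all. A minimal usage sketch (the path is a placeholder; the API is exactly what the diff above defines):

from calibre.ebooks.docx.container import DOCX

d = DOCX('/tmp/sample.docx', extract=False)  # zip-backed, no temporary directory
mi = d.metadata  # Metadata populated from docProps/core.xml and docProps/app.xml
print(mi.title, mi.authors, mi.publisher)
d.close()  # closes the zip; with extract=True it removes the temporary directory instead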
src/calibre/ebooks/docx/names.py

@@ -6,7 +6,11 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
+from lxml.etree import XPath as X
+
 DOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
+DOCPROPS = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
+APPPROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
 
 namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
@@ -38,3 +42,6 @@ namespaces = {
     'dcterms': 'http://purl.org/dc/terms/'
 }
 
+def XPath(expr):
+    return X(expr, namespaces=namespaces)
+
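The namespace-aware XPath wrapper saves each caller from carrying its own prefix map. A brief sketch of how the container code above uses it; the dc and cp prefixes resolve through the shared namespaces dict:

from calibre.ebooks.docx.names import XPath

# Compile once, then apply to any parsed docProps/core.xml tree.
title_xp = XPath('//dc:title')
keywords_xp = XPath('//cp:keywords')
# e.g. titles = title_xp(root), where root comes from container.fromstring(raw)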
src/calibre/ebooks/docx/to_html.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import sys, os
+
+from lxml import html
+from lxml.html.builder import (HTML, HEAD, TITLE, BODY, LINK, META)
+
+from calibre.ebooks.docx.container import Container
+
+class Convert(object):
+
+    def __init__(self, path_or_stream, dest_dir=None, log=None):
+        self.container = Container(path_or_stream, log=log)
+        self.log = self.container.log
+        self.dest_dir = dest_dir or os.getcwdu()
+        self.body = BODY()
+        self.html = HTML(
+            HEAD(
+                META(charset='utf-8'),
+                TITLE('TODO: read from metadata'),
+                LINK(rel='stylesheet', type='text/css', href='docx.css'),
+            ),
+            self.body
+        )
+
+    def __call__(self):
+        self.write()
+
+    def write(self):
+        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
+        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
+            f.write(raw)
+
+if __name__ == '__main__':
+    Convert(sys.argv[-1])()
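At this point to_html.py is only scaffolding: it opens the container and writes an empty HTML shell to index.html. A usage sketch, assuming the Container name it imports resolves to the container class shown earlier (the paths are placeholders):

# Equivalent to running: python to_html.py /tmp/sample.docx
from calibre.ebooks.docx.to_html import Convert

Convert('/tmp/sample.docx', dest_dir='/tmp/out')()  # writes /tmp/out/index.html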
src/calibre/ebooks/metadata/docx.py

@@ -7,80 +7,21 @@ __license__ = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-from lxml import etree
+from calibre.ebooks.docx.container import DOCX
 
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.utils.localization import canonicalize_lang
 from calibre.utils.zipfile import ZipFile
 from calibre.utils.magick.draw import identify_data
-from calibre.ebooks.oeb.base import DC11_NS
-from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
-
-NSMAP = {'dc':DC11_NS,
-         'cp':'http://schemas.openxmlformats.org/package/2006/metadata/core-properties'}
-
-def XPath(expr):
-    return etree.XPath(expr, namespaces=NSMAP)
-
-def _read_doc_props(raw, mi):
-    from calibre.ebooks.metadata import string_to_authors
-    root = etree.fromstring(raw, parser=RECOVER_PARSER)
-    titles = XPath('//dc:title')(root)
-    if titles:
-        title = titles[0].text
-        if title and title.strip():
-            mi.title = title.strip()
-    tags = []
-    for subject in XPath('//dc:subject')(root):
-        if subject.text and subject.text.strip():
-            tags.append(subject.text.strip().replace(',', '_'))
-    for keywords in XPath('//cp:keywords')(root):
-        if keywords.text and keywords.text.strip():
-            for x in keywords.text.split():
-                tags.extend(y.strip() for y in x.split(','))
-    if tags:
-        mi.tags = tags
-    authors = XPath('//dc:creator')(root)
-    aut = []
-    for author in authors:
-        if author.text and author.text.strip():
-            aut.extend(string_to_authors(author.text))
-    if aut:
-        mi.authors = aut
-
-    desc = XPath('//dc:description')(root)
-    if desc:
-        raw = etree.tostring(desc[0], method='text', encoding=unicode)
-        mi.comments = raw
-
-    langs = []
-    for lang in XPath('//dc:language')(root):
-        if lang.text and lang.text.strip():
-            l = canonicalize_lang(lang.text)
-            if l:
-                langs.append(l)
-    if langs:
-        mi.languages = langs
-
-def _read_app_props(raw, mi):
-    root = etree.fromstring(raw, parser=RECOVER_PARSER)
-    company = root.xpath('//*[local-name()="Company"]')
-    if company and company[0].text and company[0].text.strip():
-        mi.publisher = company[0].text.strip()
-
 def get_metadata(stream):
+    c = DOCX(stream, extract=False)
+    mi = c.metadata
+    c.close()
+    stream.seek(0)
+    cdata = None
     with ZipFile(stream, 'r') as zf:
-
-        mi = Metadata(_('Unknown'))
-        cdata = None
-
         for zi in zf.infolist():
             ext = zi.filename.rpartition('.')[-1].lower()
-            if zi.filename.lower() == 'docprops/core.xml':
-                _read_doc_props(zf.read(zi), mi)
-            elif zi.filename.lower() == 'docprops/app.xml':
-                _read_app_props(zf.read(zi), mi)
-            elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
+            if cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                 raw = zf.read(zi)
                 try:
                     width, height, fmt = identify_data(raw)
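get_metadata() now delegates the property parsing to the container and keeps only the cover-image scan here. Because it seeks back to the start of the stream before re-reading the zip, it needs a seekable binary stream. A minimal usage sketch, assuming this hunk belongs to the calibre.ebooks.metadata.docx module (the path is a placeholder):

from calibre.ebooks.metadata.docx import get_metadata

with open('/tmp/sample.docx', 'rb') as stream:
    mi = get_metadata(stream)
print(mi.title, mi.authors)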