Implement updating metadata in DOCX files

This commit is contained in:
Kovid Goyal 2015-03-15 21:37:07 +05:30
parent ca1143d043
commit 2a0ee491e9
4 changed files with 60 additions and 15 deletions

View File

@ -533,6 +533,17 @@ class TXTZMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.extz import set_metadata from calibre.ebooks.metadata.extz import set_metadata
set_metadata(stream, mi) set_metadata(stream, mi)
class DocXMetadataWriter(MetadataWriterPlugin):
    """Metadata writer plugin that handles files of type ``docx``."""

    name = 'Set DOCX metadata'
    file_types = set(['docx'])
    # This plugin writes metadata (see ``name`` and ``set_metadata``), so the
    # user-visible description must not claim that it reads it.
    description = _('Set metadata in %s files')%'DOCX'

    def set_metadata(self, stream, mi, type):
        """Write the metadata ``mi`` into the DOCX file open in ``stream``.

        ``type`` is part of the plugin method signature and is not used here.
        The work is delegated to ``calibre.ebooks.metadata.docx.set_metadata``,
        which is imported lazily so that it is only loaded when needed.
        """
        from calibre.ebooks.metadata.docx import set_metadata
        return set_metadata(stream, mi)
plugins += [x for x in list(locals().values()) if isinstance(x, type) and plugins += [x for x in list(locals().values()) if isinstance(x, type) and
x.__name__.endswith('MetadataWriter')] x.__name__.endswith('MetadataWriter')]

View File

@ -200,17 +200,27 @@ class DOCX(object):
return by_id, by_type return by_id, by_type
def get_document_properties_names(self):
    """Yield the archive names of the core and the app properties parts.

    Exactly two values are yielded: first the name for the ``DOCPROPS``
    relationship, then the name for the ``APPPROPS`` relationship. Each is
    looked up in ``self.relationships``; when that lookup gives ``None``,
    the first entry of ``self.names`` whose lower-cased form equals the
    conventional path (``docprops/core.xml`` / ``docprops/app.xml``) is
    used instead. ``None`` is yielded for a part that cannot be found.
    """
    lookups = (
        (DOCPROPS, 'docprops/core.xml'),
        (APPPROPS, 'docprops/app.xml'),
    )
    for rel_type, conventional_path in lookups:
        name = self.relationships.get(rel_type, None)
        if name is None:
            # No relationship entry: fall back to a case-insensitive match
            # against the names of the parts in the container.
            matches = [n for n in self.names if n.lower() == conventional_path]
            if matches:
                name = matches[0]
        yield name
@property
def metadata(self):
mi = Metadata(_('Unknown'))
dp_name, ap_name = self.get_document_properties_names()
if dp_name:
try: try:
raw = self.read(name) raw = self.read(dp_name)
except KeyError: except KeyError:
pass pass
else: else:
@ -223,14 +233,10 @@ class DOCX(object):
else: else:
read_default_style_language(raw, mi) read_default_style_language(raw, mi)
name = self.relationships.get(APPPROPS, None) ap_name = self.relationships.get(APPPROPS, None)
if name is None: if ap_name:
names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
if names:
name = names[0]
if name:
try: try:
raw = self.read(name) raw = self.read(ap_name)
except KeyError: except KeyError:
pass pass
else: else:

View File

@ -30,7 +30,7 @@ def xml2str(root, pretty_print=False, with_tail=False):
def update_doc_props(root, mi): def update_doc_props(root, mi):
def setm(name, text=None, ns='dc'): def setm(name, text=None, ns='dc'):
ans = root.makeelement('{%s}%s' % (namespaces[ns], name)) ans = root.makeelement('{%s}%s' % (namespaces[ns], name))
for child in root: for child in tuple(root):
if child.tag == ans.tag: if child.tag == ans.tag:
root.remove(child) root.remove(child)
ans.text = text ans.text = text

View File

@ -7,9 +7,13 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.ebooks.docx.container import DOCX from io import BytesIO
from calibre.ebooks.docx.names import XPath, get
from lxml import etree
from calibre.ebooks.docx.container import DOCX
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str, namespaces
from calibre.ebooks.docx.names import XPath, get
from calibre.utils.magick.draw import identify_data from calibre.utils.magick.draw import identify_data
images = XPath('//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]') images = XPath('//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]')
@ -44,6 +48,30 @@ def get_metadata(stream):
return mi return mi
def set_metadata(stream, mi):
    """Write the metadata ``mi`` into the DOCX file open in ``stream``.

    The core document properties part is parsed, updated through
    ``update_doc_props`` and written back. When the app properties part can
    be read, every existing ``Company`` child of its root is removed and a
    new one whose text is ``mi.publisher`` is appended; that part is then
    written back as well. All writes go through ``safe_replace``.
    """
    from calibre.utils.zipfile import safe_replace

    container = DOCX(stream, extract=False)
    core_name, app_name = container.get_document_properties_names()
    core_raw = container.read(core_name)

    # Reading the app properties is best effort: on any failure only the
    # core properties are rewritten.
    try:
        app_raw = container.read(app_name)
    except Exception:
        app_raw = None

    core_root = etree.fromstring(core_raw)
    update_doc_props(core_root, mi)

    extra = {}
    if app_raw is not None:
        app_root = etree.fromstring(app_raw)
        company = app_root.makeelement('{%s}Company' % namespaces['ep'])
        # Collect the stale elements first so the tree is not mutated while
        # it is being iterated.
        stale = [el for el in app_root if el.tag == company.tag]
        for el in stale:
            app_root.remove(el)
        company.text = mi.publisher
        app_root.append(company)
        extra[app_name] = BytesIO(xml2str(app_root))

    # Rewind before the archive is rewritten.
    stream.seek(0)
    safe_replace(stream, core_name, BytesIO(xml2str(core_root)),
                 extra_replacements=extra)
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
with open(sys.argv[-1], 'rb') as stream: with open(sys.argv[-1], 'rb') as stream: