mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use lxml to read ODT metadata
This commit is contained in:
parent
9790713949
commit
329f4d262e
@ -20,18 +20,23 @@
|
|||||||
#
|
#
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
import zipfile, re, io, os
|
import io
|
||||||
|
import os
|
||||||
|
import re
|
||||||
import xml.sax.saxutils
|
import xml.sax.saxutils
|
||||||
|
|
||||||
from odf.namespaces import OFFICENS, DCNS, METANS
|
from lxml.etree import fromstring, tostring
|
||||||
from odf.opendocument import load as odLoad
|
|
||||||
from odf.draw import Image as odImage, Frame as odFrame
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn, authors_to_string
|
from calibre.ebooks.metadata import (
|
||||||
|
MetaInformation, authors_to_string, check_isbn, string_to_authors
|
||||||
from calibre.utils.imghdr import identify
|
)
|
||||||
from calibre.utils.date import parse_date
|
from calibre.utils.date import parse_date
|
||||||
|
from calibre.utils.imghdr import identify
|
||||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||||
|
from calibre.utils.zipfile import ZipFile, safe_replace
|
||||||
|
from odf.draw import Frame as odFrame, Image as odImage
|
||||||
|
from odf.namespaces import DCNS, METANS, OFFICENS
|
||||||
|
from odf.opendocument import load as odLoad
|
||||||
from polyglot.builtins import string_or_bytes
|
from polyglot.builtins import string_or_bytes
|
||||||
|
|
||||||
whitespace = re.compile(r'\s+')
|
whitespace = re.compile(r'\s+')
|
||||||
@ -160,7 +165,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
|
|||||||
|
|
||||||
|
|
||||||
def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfields={}):
|
def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfields={}):
|
||||||
zin = zipfile.ZipFile(stream, mode)
|
zin = ZipFile(stream, mode)
|
||||||
odfs = odfmetaparser(deletefields, yieldfields, addfields)
|
odfs = odfmetaparser(deletefields, yieldfields, addfields)
|
||||||
parser = xml.sax.make_parser()
|
parser = xml.sax.make_parser()
|
||||||
parser.setFeature(xml.sax.handler.feature_namespaces, True)
|
parser.setFeature(xml.sax.handler.feature_namespaces, True)
|
||||||
@ -172,25 +177,44 @@ def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfi
|
|||||||
|
|
||||||
|
|
||||||
def get_metadata(stream, extract_cover=True):
|
def get_metadata(stream, extract_cover=True):
|
||||||
zin, odfs = get_odf_meta_parsed(stream)
|
with ZipFile(stream) as zf:
|
||||||
data = odfs.seenfields
|
meta = zf.read('meta.xml')
|
||||||
|
root = fromstring(meta)
|
||||||
|
|
||||||
|
def find(field):
|
||||||
|
ns, tag = fields[field]
|
||||||
|
ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
|
||||||
|
if ans:
|
||||||
|
return tostring(ans[0], method='text', encoding='unicode', with_tail=False).strip()
|
||||||
|
|
||||||
mi = MetaInformation(None, [])
|
mi = MetaInformation(None, [])
|
||||||
if 'title' in data:
|
title = find('title')
|
||||||
mi.title = data['title']
|
if title:
|
||||||
if data.get('initial-creator', '').strip():
|
mi.title = title
|
||||||
mi.authors = string_to_authors(data['initial-creator'])
|
creator = find('initial-creator') or find('creator')
|
||||||
elif 'creator' in data:
|
if creator:
|
||||||
mi.authors = string_to_authors(data['creator'])
|
mi.authors = string_to_authors(creator)
|
||||||
if 'description' in data:
|
desc = find('description')
|
||||||
mi.comments = data['description']
|
if desc:
|
||||||
if 'language' in data:
|
mi.comments = desc
|
||||||
mi.language = data['language']
|
lang = find('language')
|
||||||
kw = data.get('keyword') or data.get('keywords')
|
if lang and canonicalize_lang(lang):
|
||||||
|
mi.languages = [canonicalize_lang(lang)]
|
||||||
|
kw = find('keyword') or find('keywords')
|
||||||
if kw:
|
if kw:
|
||||||
mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
|
mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
|
||||||
|
data = {}
|
||||||
|
for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
|
||||||
|
name = (tag.get('{%s}name' % METANS) or '').lower()
|
||||||
|
vtype = tag.get('{%s}value-type' % METANS) or 'string'
|
||||||
|
val = tag.text
|
||||||
|
if name and val:
|
||||||
|
if vtype == 'boolean':
|
||||||
|
val = val == 'true'
|
||||||
|
data[name] = val
|
||||||
opfmeta = False # we need this later for the cover
|
opfmeta = False # we need this later for the cover
|
||||||
opfnocover = False
|
opfnocover = False
|
||||||
if data.get('opf.metadata','') == 'true':
|
if data.get('opf.metadata'):
|
||||||
# custom metadata contains OPF information
|
# custom metadata contains OPF information
|
||||||
opfmeta = True
|
opfmeta = True
|
||||||
if data.get('opf.titlesort', ''):
|
if data.get('opf.titlesort', ''):
|
||||||
@ -218,10 +242,10 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
cl = canonicalize_lang(data['opf.language'])
|
cl = canonicalize_lang(data['opf.language'])
|
||||||
if cl:
|
if cl:
|
||||||
mi.languages = [cl]
|
mi.languages = [cl]
|
||||||
opfnocover = data.get('opf.nocover', 'false') == 'true'
|
opfnocover = data.get('opf.nocover', False)
|
||||||
if not opfnocover:
|
if not opfnocover:
|
||||||
try:
|
try:
|
||||||
read_cover(stream, zin, mi, opfmeta, extract_cover)
|
read_cover(stream, zf, mi, opfmeta, extract_cover)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # Do not let an error reading the cover prevent reading other data
|
pass # Do not let an error reading the cover prevent reading other data
|
||||||
|
|
||||||
@ -243,7 +267,6 @@ def get_meta_doc_props(mi):
|
|||||||
|
|
||||||
|
|
||||||
def set_metadata(stream, mi):
|
def set_metadata(stream, mi):
|
||||||
from calibre.utils.zipfile import safe_replace
|
|
||||||
metaFields = get_meta_doc_props(mi)
|
metaFields = get_meta_doc_props(mi)
|
||||||
|
|
||||||
zin, odfs = get_odf_meta_parsed(stream, addfields=metaFields, deletefields=metaFields)
|
zin, odfs = get_odf_meta_parsed(stream, addfields=metaFields, deletefields=metaFields)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user