Use lxml to read ODT metadata

This commit is contained in:
Kovid Goyal 2019-09-17 10:26:19 +05:30
parent 9790713949
commit 329f4d262e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -20,18 +20,23 @@
# #
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import zipfile, re, io, os import io
import os
import re
import xml.sax.saxutils import xml.sax.saxutils
from odf.namespaces import OFFICENS, DCNS, METANS from lxml.etree import fromstring, tostring
from odf.opendocument import load as odLoad
from odf.draw import Image as odImage, Frame as odFrame
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn, authors_to_string from calibre.ebooks.metadata import (
MetaInformation, authors_to_string, check_isbn, string_to_authors
from calibre.utils.imghdr import identify )
from calibre.utils.date import parse_date from calibre.utils.date import parse_date
from calibre.utils.imghdr import identify
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.zipfile import ZipFile, safe_replace
from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import DCNS, METANS, OFFICENS
from odf.opendocument import load as odLoad
from polyglot.builtins import string_or_bytes from polyglot.builtins import string_or_bytes
whitespace = re.compile(r'\s+') whitespace = re.compile(r'\s+')
@@ -160,7 +165,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfields={}): def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfields={}):
zin = zipfile.ZipFile(stream, mode) zin = ZipFile(stream, mode)
odfs = odfmetaparser(deletefields, yieldfields, addfields) odfs = odfmetaparser(deletefields, yieldfields, addfields)
parser = xml.sax.make_parser() parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, True) parser.setFeature(xml.sax.handler.feature_namespaces, True)
@@ -172,58 +177,77 @@ def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfi
def get_metadata(stream, extract_cover=True): def get_metadata(stream, extract_cover=True):
zin, odfs = get_odf_meta_parsed(stream) with ZipFile(stream) as zf:
data = odfs.seenfields meta = zf.read('meta.xml')
mi = MetaInformation(None, []) root = fromstring(meta)
if 'title' in data:
mi.title = data['title'] def find(field):
if data.get('initial-creator', '').strip(): ns, tag = fields[field]
mi.authors = string_to_authors(data['initial-creator']) ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
elif 'creator' in data: if ans:
mi.authors = string_to_authors(data['creator']) return tostring(ans[0], method='text', encoding='unicode', with_tail=False).strip()
if 'description' in data:
mi.comments = data['description'] mi = MetaInformation(None, [])
if 'language' in data: title = find('title')
mi.language = data['language'] if title:
kw = data.get('keyword') or data.get('keywords') mi.title = title
if kw: creator = find('initial-creator') or find('creator')
mi.tags = [x.strip() for x in kw.split(',') if x.strip()] if creator:
opfmeta = False # we need this later for the cover mi.authors = string_to_authors(creator)
opfnocover = False desc = find('description')
if data.get('opf.metadata','') == 'true': if desc:
# custom metadata contains OPF information mi.comments = desc
opfmeta = True lang = find('language')
if data.get('opf.titlesort', ''): if lang and canonicalize_lang(lang):
mi.title_sort = data['opf.titlesort'] mi.languages = [canonicalize_lang(lang)]
if data.get('opf.authors', ''): kw = find('keyword') or find('keywords')
mi.authors = string_to_authors(data['opf.authors']) if kw:
if data.get('opf.authorsort', ''): mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
mi.author_sort = data['opf.authorsort'] data = {}
if data.get('opf.isbn', ''): for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
isbn = check_isbn(data['opf.isbn']) name = (tag.get('{%s}name' % METANS) or '').lower()
if isbn is not None: vtype = tag.get('{%s}value-type' % METANS) or 'string'
mi.isbn = isbn val = tag.text
if data.get('opf.publisher', ''): if name and val:
mi.publisher = data['opf.publisher'] if vtype == 'boolean':
if data.get('opf.pubdate', ''): val = val == 'true'
mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True) data[name] = val
if data.get('opf.series', ''): opfmeta = False # we need this later for the cover
mi.series = data['opf.series'] opfnocover = False
if data.get('opf.seriesindex', ''): if data.get('opf.metadata'):
try: # custom metadata contains OPF information
mi.series_index = float(data['opf.seriesindex']) opfmeta = True
except Exception: if data.get('opf.titlesort', ''):
mi.series_index = 1.0 mi.title_sort = data['opf.titlesort']
if data.get('opf.language', ''): if data.get('opf.authors', ''):
cl = canonicalize_lang(data['opf.language']) mi.authors = string_to_authors(data['opf.authors'])
if cl: if data.get('opf.authorsort', ''):
mi.languages = [cl] mi.author_sort = data['opf.authorsort']
opfnocover = data.get('opf.nocover', 'false') == 'true' if data.get('opf.isbn', ''):
if not opfnocover: isbn = check_isbn(data['opf.isbn'])
try: if isbn is not None:
read_cover(stream, zin, mi, opfmeta, extract_cover) mi.isbn = isbn
except Exception: if data.get('opf.publisher', ''):
pass # Do not let an error reading the cover prevent reading other data mi.publisher = data['opf.publisher']
if data.get('opf.pubdate', ''):
mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
if data.get('opf.series', ''):
mi.series = data['opf.series']
if data.get('opf.seriesindex', ''):
try:
mi.series_index = float(data['opf.seriesindex'])
except Exception:
mi.series_index = 1.0
if data.get('opf.language', ''):
cl = canonicalize_lang(data['opf.language'])
if cl:
mi.languages = [cl]
opfnocover = data.get('opf.nocover', False)
if not opfnocover:
try:
read_cover(stream, zf, mi, opfmeta, extract_cover)
except Exception:
pass # Do not let an error reading the cover prevent reading other data
return mi return mi
@@ -243,7 +267,6 @@ def get_meta_doc_props(mi):
def set_metadata(stream, mi): def set_metadata(stream, mi):
from calibre.utils.zipfile import safe_replace
metaFields = get_meta_doc_props(mi) metaFields = get_meta_doc_props(mi)
zin, odfs = get_odf_meta_parsed(stream, addfields=metaFields, deletefields=metaFields) zin, odfs = get_odf_meta_parsed(stream, addfields=metaFields, deletefields=metaFields)