Use lxml to read ODT metadata

2025-07-09 03:04:10 -04:00 · 2019-09-17 10:26:19 +05:30 · 2019-09-17 10:26:19 +05:30 · 329f4d262e
commit 329f4d262e
parent 9790713949
1 changed file with 84 additions and 61 deletions
--- a/src/calibre/ebooks/metadata/odt.py
+++ b/src/calibre/ebooks/metadata/odt.py
@@ -20,18 +20,23 @@
 #
 from __future__ import absolute_import, division, print_function, unicode_literals
-import zipfile, re, io, os
+import io
 import os
 import re
 import xml.sax.saxutils
-from odf.namespaces import OFFICENS, DCNS, METANS
+from lxml.etree import fromstring, tostring
 from odf.opendocument import load as odLoad
 from odf.draw import Image as odImage, Frame as odFrame
-from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn, authors_to_string
+from calibre.ebooks.metadata import (
-
+    MetaInformation, authors_to_string, check_isbn, string_to_authors
-from calibre.utils.imghdr import identify
+)
 from calibre.utils.date import parse_date
 from calibre.utils.imghdr import identify
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
 from calibre.utils.zipfile import ZipFile, safe_replace
 from odf.draw import Frame as odFrame, Image as odImage
 from odf.namespaces import DCNS, METANS, OFFICENS
 from odf.opendocument import load as odLoad
 from polyglot.builtins import string_or_bytes
 whitespace = re.compile(r'\s+')
@@ -160,7 +165,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
 def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfields={}):
-    zin = zipfile.ZipFile(stream, mode)
+    zin = ZipFile(stream, mode)
    odfs = odfmetaparser(deletefields, yieldfields, addfields)
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, True)
@@ -172,58 +177,77 @@ def get_odf_meta_parsed(stream, mode='r', deletefields={}, yieldfields={}, addfi
 def get_metadata(stream, extract_cover=True):
-    zin, odfs = get_odf_meta_parsed(stream)
+    with ZipFile(stream) as zf:
-    data = odfs.seenfields
+        meta = zf.read('meta.xml')
-    mi = MetaInformation(None, [])
+        root = fromstring(meta)
-    if 'title' in data:
+
-        mi.title = data['title']
+        def find(field):
-    if data.get('initial-creator', '').strip():
+            ns, tag = fields[field]
-        mi.authors = string_to_authors(data['initial-creator'])
+            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
-    elif 'creator' in data:
+            if ans:
-        mi.authors = string_to_authors(data['creator'])
+                return tostring(ans[0], method='text', encoding='unicode', with_tail=False).strip()
-    if 'description' in data:
+
-        mi.comments = data['description']
+        mi = MetaInformation(None, [])
-    if 'language' in data:
+        title = find('title')
-        mi.language = data['language']
+        if title:
-    kw = data.get('keyword') or data.get('keywords')
+            mi.title = title
-    if kw:
+        creator = find('initial-creator') or find('creator')
-        mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
+        if creator:
-    opfmeta = False  # we need this later for the cover
+            mi.authors = string_to_authors(creator)
-    opfnocover = False
+        desc = find('description')
-    if data.get('opf.metadata','') == 'true':
+        if desc:
-        # custom metadata contains OPF information
+            mi.comments = desc
-        opfmeta = True
+        lang = find('language')
-        if data.get('opf.titlesort', ''):
+        if lang and canonicalize_lang(lang):
-            mi.title_sort = data['opf.titlesort']
+            mi.languages = [canonicalize_lang(lang)]
-        if data.get('opf.authors', ''):
+        kw = find('keyword') or find('keywords')
-            mi.authors = string_to_authors(data['opf.authors'])
+        if kw:
-        if data.get('opf.authorsort', ''):
+            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
-            mi.author_sort = data['opf.authorsort']
+        data = {}
-        if data.get('opf.isbn', ''):
+        for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
-            isbn = check_isbn(data['opf.isbn'])
+            name = (tag.get('{%s}name' % METANS) or '').lower()
-            if isbn is not None:
+            vtype = tag.get('{%s}value-type' % METANS) or 'string'
-                mi.isbn = isbn
+            val = tag.text
-        if data.get('opf.publisher', ''):
+            if name and val:
-            mi.publisher = data['opf.publisher']
+                if vtype == 'boolean':
-        if data.get('opf.pubdate', ''):
+                    val = val == 'true'
-            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
+                data[name] = val
-        if data.get('opf.series', ''):
+        opfmeta = False  # we need this later for the cover
-            mi.series = data['opf.series']
+        opfnocover = False
-            if data.get('opf.seriesindex', ''):
+        if data.get('opf.metadata'):
-                try:
+            # custom metadata contains OPF information
-                    mi.series_index = float(data['opf.seriesindex'])
+            opfmeta = True
-                except Exception:
+            if data.get('opf.titlesort', ''):
-                    mi.series_index = 1.0
+                mi.title_sort = data['opf.titlesort']
-        if data.get('opf.language', ''):
+            if data.get('opf.authors', ''):
-            cl = canonicalize_lang(data['opf.language'])
+                mi.authors = string_to_authors(data['opf.authors'])
-            if cl:
+            if data.get('opf.authorsort', ''):
-                mi.languages = [cl]
+                mi.author_sort = data['opf.authorsort']
-        opfnocover = data.get('opf.nocover', 'false') == 'true'
+            if data.get('opf.isbn', ''):
-    if not opfnocover:
+                isbn = check_isbn(data['opf.isbn'])
-        try:
+                if isbn is not None:
-            read_cover(stream, zin, mi, opfmeta, extract_cover)
+                    mi.isbn = isbn
-        except Exception:
+            if data.get('opf.publisher', ''):
-            pass  # Do not let an error reading the cover prevent reading other data
+                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data
    return mi
@@ -243,7 +267,6 @@ def get_meta_doc_props(mi):
 def set_metadata(stream, mi):
    from calibre.utils.zipfile import safe_replace
    metaFields = get_meta_doc_props(mi)
    zin, odfs = get_odf_meta_parsed(stream, addfields=metaFields, deletefields=metaFields)