From 7773746a2696c490f50672d4a9066a6712bd999a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 17 Jun 2016 10:09:59 +0530
Subject: [PATCH] Start work on supporting EPUB 3 metadata

---
 src/calibre/ebooks/metadata/opf3.py      | 143 +++++++++++++++++++++++
 src/calibre/ebooks/metadata/opf3_test.py |  63 ++++++++++
 2 files changed, 206 insertions(+)
 create mode 100644 src/calibre/ebooks/metadata/opf3.py
 create mode 100644 src/calibre/ebooks/metadata/opf3_test.py

diff --git a/src/calibre/ebooks/metadata/opf3.py b/src/calibre/ebooks/metadata/opf3.py
new file mode 100644
index 0000000000..fc9c103ccb
--- /dev/null
+++ b/src/calibre/ebooks/metadata/opf3.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from collections import defaultdict
+import re
+
+from lxml import etree
+
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.utils import parse_opf
+from calibre.ebooks.oeb.base import OPF2_NSMAP, OPF
+
+# Utils {{{
+# http://www.idpf.org/epub/vocab/package/pfx/
+reserved_prefixes = {
+    'dcterms': 'http://purl.org/dc/terms/',
+    'epubsc': 'http://idpf.org/epub/vocab/sc/#',
+    'marc': 'http://id.loc.gov/vocabulary/',
+    'media': 'http://www.idpf.org/epub/vocab/overlays/#',
+    'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#',
+    'rendition':'http://www.idpf.org/vocab/rendition/#',
+    'schema': 'http://schema.org/',
+    'xsd': 'http://www.w3.org/2001/XMLSchema#',
+}
+
+_xpath_cache = {}
+_re_cache = {}
+
+def XPath(x):
+    try:
+        return _xpath_cache[x]
+    except KeyError:
+        _xpath_cache[x] = ans = etree.XPath(x, namespaces=OPF2_NSMAP)
+        return ans
+
+def regex(r, flags=0):
+    try:
+        return _re_cache[(r, flags)]
+    except KeyError:
+        _re_cache[(r, flags)] = ans = re.compile(r, flags)
+        return ans
+# }}}
+
+# Prefixes {{{
+
+def parse_prefixes(x):
+    return {m.group(1):m.group(2) for m in re.finditer(r'(\S+): \s*(\S+)', x)}
+
+def read_prefixes(root):
+    ans = reserved_prefixes.copy()
+    ans.update(parse_prefixes(root.get('prefix') or ''))
+    return ans
+
+def expand_prefix(raw, prefixes):
+    return regex(r'(\S+)\s*:\s*(\S+)').sub(lambda m:(prefixes.get(m.group(1), m.group(1)) + ':' + m.group(2)), raw)
+# }}}
+
+# Refines {{{
+def read_refines(root):
+    ans = defaultdict(list)
+    for meta in XPath('./opf:metadata/opf:meta[@refines]')(root):
+        r = meta.get('refines') or ''
+        if r.startswith('#'):
+            ans[r[1:]].append(meta)
+    return ans
+# }}}
+
+# Identifiers {{{
+def parse_identifier(ident, val, refines):
+    idid = ident.get('id')
+    refines = refines[idid]
+    scheme = None
+    lval = val.lower()
+
+    def finalize(scheme, val):
+        if not scheme or not val:
+            return None, None
+        scheme = scheme.lower()
+        if scheme in ('http', 'https'):
+            return None, None
+        if scheme.startswith('isbn'):
+            scheme = 'isbn'
+        if scheme == 'isbn':
+            val = val.split(':')[-1]
+            val = check_isbn(val)
+            if val is None:
+                return None, None
+        return scheme, val
+
+    # Try the OPF 2 style opf:scheme attribute, which will be present, for
+    # example, in EPUB 3 files that have had their metadata set by an
+    # application that only understands EPUB 2.
+    scheme = ident.get(OPF('scheme'))
+    if scheme and not lval.startswith('urn:'):
+        return finalize(scheme, val)
+
+    # Technically, we should be looking for refines that define the scheme, but
+    # the IDioticPF created such a bad spec that they got their own
+    # examples wrong, so I cannot be bothered doing this.
+
+    # Parse the value for the scheme
+    if lval.startswith('urn:'):
+        val = val[4:]
+
+    prefix, rest = val.partition(':')[::2]
+    return finalize(prefix, rest)
+
+def read_identifiers(root, prefixes, refines):
+    ans = defaultdict(list)
+    for ident in XPath('./opf:metadata/dc:identifier')(root):
+        val = (ident.text or '').strip()
+        if val:
+            scheme, val = parse_identifier(ident, val, refines)
+            if scheme and val:
+                ans[scheme].append(val)
+    return ans
+# }}}
+
+def read_metadata(root):
+    ans = Metadata(_('Unknown'), [_('Unknown')])
+    prefixes, refines = read_prefixes(root), read_refines(root)
+    identifiers = read_identifiers(root, prefixes, refines)
+    ids = {}
+    for key, vals in identifiers.iteritems():
+        if key == 'calibre':
+            ans.application_id = vals[0]
+        elif key != 'uuid':
+            ids[key] = vals[0]
+    ans.set_identifiers(ids)
+
+    return ans
+
+def get_metadata(stream):
+    root = parse_opf(stream)
+    return read_metadata(root)
+
+if __name__ == '__main__':
+    import sys
+    print(get_metadata(open(sys.argv[-1], 'rb')))
diff --git a/src/calibre/ebooks/metadata/opf3_test.py b/src/calibre/ebooks/metadata/opf3_test.py
new file mode 100644
index 0000000000..3173bf7e5b
--- /dev/null
+++ b/src/calibre/ebooks/metadata/opf3_test.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from collections import defaultdict
+import unittest
+
+from lxml import etree
+
+from calibre.ebooks.metadata.opf3 import (
+    parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers, read_metadata
+)
+
+TEMPLATE = '''{metadata}''' # noqa
+
+class TestOPF3(unittest.TestCase):
+
+    ae = unittest.TestCase.assertEqual
+
+    def get_opf(self, metadata=''):
+        return etree.fromstring(TEMPLATE.format(metadata=metadata))
+
+    def test_prefix_parsing(self):
+        self.ae(parse_prefixes('foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'),
+                {'foaf':'http://xmlns.com/foaf/spec/', 'dbp': 'http://dbpedia.org/ontology/'})
+        for raw, expanded in (
+                ('onix:xxx', reserved_prefixes['onix'] + ':xxx'),
+                ('xxx:onix', 'xxx:onix'),
+                ('xxx', 'xxx'),
+        ):
+            self.ae(expand_prefix(raw, reserved_prefixes), expanded)
+
+    def test_identifiers(self):
+        def idt(val, scheme=None):
+            return '{val}'.format(scheme=('opf:scheme="%s"'%scheme if scheme else ''), val=val)
+        for m, result in (
+                (idt('abc', 'ISBN'), {}),
+                (idt('isbn:9780230739581'), {'isbn':['9780230739581']}),
+                (idt('urn:isbn:9780230739581'), {'isbn':['9780230739581']}),
+                (idt('9780230739581', 'ISBN'), {'isbn':['9780230739581']}),
+                (idt('isbn:9780230739581', 'ISBN'), {'isbn':['9780230739581']}),
+                (idt('key:val'), {'key':['val']}),
+                (idt('url:http://x'), {'url':['http://x']}),
+                (idt('a:1')+idt('a:2'), {'a':['1', '2']}),
+        ):
+            self.ae(result, dict(read_identifiers(self.get_opf(m), reserved_prefixes, defaultdict(list))))
+        mi = read_metadata(self.get_opf(
+            metadata=idt('a:1')+idt('a:2')+idt('calibre:x')+idt('uuid:y')))
+        self.ae(mi.application_id, 'x')
+
+class TestRunner(unittest.main):
+
+    def createTests(self):
+        tl = unittest.TestLoader()
+        self.test = tl.loadTestsFromTestCase(TestOPF3)
+
+def run(verbosity=4):
+    TestRunner(verbosity=verbosity, exit=False)
+
+if __name__ == '__main__':
+    run(verbosity=4)