Start work on supporting EPUB 3 metadata

This commit is contained in:
Kovid Goyal 2016-06-17 10:09:59 +05:30
parent 1c7012507c
commit 7773746a26
2 changed files with 206 additions and 0 deletions

View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from collections import defaultdict
import re
from lxml import etree
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.utils import parse_opf
from calibre.ebooks.oeb.base import OPF2_NSMAP, OPF
# Utils {{{
# http://www.idpf.org/epub/vocab/package/pfx/
# Default prefix -> URL mappings. read_prefixes() starts from a copy of this
# table and overlays whatever the package element's prefix attribute declares.
reserved_prefixes = {
    'dcterms': 'http://purl.org/dc/terms/',
    'epubsc': 'http://idpf.org/epub/vocab/sc/#',
    'marc': 'http://id.loc.gov/vocabulary/',
    'media': 'http://www.idpf.org/epub/vocab/overlays/#',
    'onix': 'http://www.editeur.org/ONIX/book/codelists/current.html#',
    'rendition': 'http://www.idpf.org/vocab/rendition/#',
    'schema': 'http://schema.org/',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
}

# Memoization tables: compiled XPath expressions keyed by expression string,
# and compiled regular expressions keyed by (pattern, flags).
_xpath_cache = {}
_re_cache = {}
def XPath(x):
    """Return a compiled ``etree.XPath`` for the expression ``x``.

    The expression is compiled with ``OPF2_NSMAP`` as its namespace map.
    Compiled objects are memoized in ``_xpath_cache`` so each distinct
    expression string is only compiled once.
    """
    if x not in _xpath_cache:
        _xpath_cache[x] = etree.XPath(x, namespaces=OPF2_NSMAP)
    return _xpath_cache[x]
def regex(r, flags=0):
    """Return the compiled regular expression for pattern ``r`` and ``flags``.

    Results are memoized in ``_re_cache`` keyed on ``(r, flags)``.
    """
    key = (r, flags)
    if key not in _re_cache:
        _re_cache[key] = re.compile(r, flags)
    return _re_cache[key]
# }}}
# Prefixes {{{
def parse_prefixes(x):
    """Parse a prefix declaration string into a ``{name: url}`` dict.

    Each mapping has the form ``name: url``: the name, a colon, at least one
    space, then the URL. Text that does not fit this shape is ignored. When
    a name is declared more than once the last declaration wins.
    """
    mappings = {}
    for match in re.finditer(r'(\S+): \s*(\S+)', x):
        name, url = match.groups()
        mappings[name] = url
    return mappings
def read_prefixes(root):
    """Return the prefix -> URL mappings in effect for the package ``root``.

    The result is a fresh dict holding ``reserved_prefixes`` overlaid with
    anything declared in the root element's ``prefix`` attribute, so
    declared prefixes take precedence over the reserved ones.
    """
    declared = parse_prefixes(root.get('prefix') or '')
    merged = dict(reserved_prefixes)
    merged.update(declared)
    return merged
def expand_prefix(raw, prefixes):
    """Replace known prefixes in ``prefix:value`` tokens found in ``raw``.

    For every ``prefix:value`` token the prefix is looked up in ``prefixes``
    and, when present, replaced by its mapped URL; unknown prefixes are left
    as they are. The colon and the value are always kept (whitespace around
    the colon is dropped). Text with no such token is returned unchanged.
    """
    def substitute(match):
        name, value = match.group(1), match.group(2)
        return prefixes.get(name, name) + ':' + value

    return regex(r'(\S+)\s*:\s*(\S+)').sub(substitute, raw)
# }}}
# Refines {{{
def read_refines(root):
    """Group the refining ``<meta>`` elements by the id they refine.

    Returns a ``defaultdict(list)`` mapping a target id (the ``refines``
    attribute without its leading ``#``) to the list of ``<meta>`` elements
    pointing at it. ``refines`` values that do not start with ``#`` are
    skipped.
    """
    by_target = defaultdict(list)
    metas = XPath('./opf:metadata/opf:meta[@refines]')(root)
    for meta in metas:
        target = meta.get('refines') or ''
        if not target.startswith('#'):
            continue
        by_target[target[1:]].append(meta)
    return by_target
# }}}
# Identifiers {{{
def parse_identifier(ident, val, refines):
    """Work out the ``(scheme, value)`` pair for one ``dc:identifier``.

    :param ident: The ``dc:identifier`` element. Only its OPF 2 style
        ``opf:scheme`` attribute is consulted.
    :param val: The text of the element (read_identifiers() passes it
        stripped and non-empty).
    :param refines: Mapping of element id to refining ``<meta>`` elements.
        It is kept in the signature for callers but intentionally not
        consulted, see the comment in the body. It is never modified.
    :return: ``(scheme, value)`` with the scheme lower-cased, or
        ``(None, None)`` when no usable identifier could be extracted.
    """
    def finalize(scheme, val):
        # Normalize a candidate pair, rejecting anything unusable
        if not scheme or not val:
            return None, None
        scheme = scheme.lower()
        if scheme in ('http', 'https'):
            # The value was a plain URL, not a scheme:value identifier
            return None, None
        if scheme.startswith('isbn'):
            # Every scheme beginning with "isbn" is folded into plain
            # "isbn". Only the part of the value after the last colon is
            # validated; a None result from check_isbn rejects the
            # identifier.
            val = check_isbn(val.split(':')[-1])
            if val is None:
                return None, None
            scheme = 'isbn'
        return scheme, val

    is_urn = val.lower().startswith('urn:')

    # Try the OPF 2 style opf:scheme attribute, which will be present, for
    # example, in EPUB 3 files that have had their metadata set by an
    # application that only understands EPUB 2. URN values carry their own
    # scheme, so for those the attribute is ignored.
    scheme = ident.get(OPF('scheme'))
    if scheme and not is_urn:
        return finalize(scheme, val)

    # Strictly speaking the scheme should be looked up in the refining
    # <meta> elements, but the spec's own examples get this wrong, so it is
    # not attempted. ``refines`` is deliberately left untouched: indexing a
    # defaultdict here would insert empty entries into the caller's mapping.

    # Parse the scheme out of the value itself
    if is_urn:
        val = val[4:]
    prefix, sep, rest = val.partition(':')
    return finalize(prefix, rest)
def read_identifiers(root, prefixes, refines):
    """Collect all usable identifiers from the package metadata.

    Every ``dc:identifier`` under ``opf:metadata`` with non-blank text is
    passed to parse_identifier(); pairs it accepts are grouped by scheme.

    :param root: The package root element.
    :param prefixes: Prefix mappings for the package (not used here).
    :param refines: Refines mapping, forwarded to parse_identifier().
    :return: ``defaultdict(list)`` mapping scheme to the list of values, in
        document order.
    """
    found = defaultdict(list)
    for ident in XPath('./opf:metadata/dc:identifier')(root):
        text = (ident.text or '').strip()
        if not text:
            continue
        scheme, value = parse_identifier(ident, text, refines)
        if scheme and value:
            found[scheme].append(value)
    return found
# }}}
def read_metadata(root):
    """Build a ``Metadata`` object from the parsed OPF package ``root``.

    Only identifiers are read so far. The first ``calibre`` identifier
    becomes ``application_id``, ``uuid`` identifiers are dropped, and for
    every other scheme the first value is stored via ``set_identifiers``.
    Title and authors are initialized to the translated string 'Unknown'.
    """
    mi = Metadata(_('Unknown'), [_('Unknown')])
    prefixes = read_prefixes(root)
    refines = read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)

    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'uuid':
            continue
        if key == 'calibre':
            mi.application_id = vals[0]
        else:
            ids[key] = vals[0]
    mi.set_identifiers(ids)
    return mi
def get_metadata(stream):
    """Parse the OPF document in ``stream`` and return its metadata.

    ``stream`` is handed to ``parse_opf`` and the resulting root element to
    read_metadata().
    """
    return read_metadata(parse_opf(stream))
if __name__ == '__main__':
    # Command line use: print the metadata of the OPF file named by the last
    # argument.
    import sys
    # Use a context manager so the file is closed deterministically rather
    # than whenever the file object happens to be garbage collected.
    with open(sys.argv[-1], 'rb') as opf_stream:
        print(get_metadata(opf_stream))

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from collections import defaultdict
import unittest
from lxml import etree
from calibre.ebooks.metadata.opf3 import (
parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers, read_metadata
)
# Minimal OPF 3 package document; {metadata} is replaced with the markup to
# place inside the <metadata> element.
TEMPLATE = (
    '<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid">'  # noqa
    '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'  # noqa
    '{metadata}'
    '</metadata></package>'
)
class TestOPF3(unittest.TestCase):
    """Tests for the OPF 3 prefix and identifier reading code."""

    ae = unittest.TestCase.assertEqual

    def get_opf(self, metadata=''):
        """Return the root of TEMPLATE parsed with ``metadata`` inserted."""
        document = TEMPLATE.format(metadata=metadata)
        return etree.fromstring(document)

    def test_prefix_parsing(self):
        declaration = 'foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'
        self.ae(
            parse_prefixes(declaration),
            {'foaf':'http://xmlns.com/foaf/spec/', 'dbp': 'http://dbpedia.org/ontology/'})

        # Only a known prefix in the leading position is expanded
        expansions = (
            ('onix:xxx', reserved_prefixes['onix'] + ':xxx'),
            ('xxx:onix', 'xxx:onix'),
            ('xxx', 'xxx'),
        )
        for raw, expanded in expansions:
            self.ae(expand_prefix(raw, reserved_prefixes), expanded)

    def test_identifiers(self):
        def idt(val, scheme=None):
            # Markup for one dc:identifier, optionally with an OPF 2 style
            # opf:scheme attribute
            attr = 'opf:scheme="%s"' % scheme if scheme else ''
            return '<dc:identifier {scheme}>{val}</dc:identifier>'.format(scheme=attr, val=val)

        cases = (
            (idt('abc', 'ISBN'), {}),
            (idt('isbn:9780230739581'), {'isbn':['9780230739581']}),
            (idt('urn:isbn:9780230739581'), {'isbn':['9780230739581']}),
            (idt('9780230739581', 'ISBN'), {'isbn':['9780230739581']}),
            (idt('isbn:9780230739581', 'ISBN'), {'isbn':['9780230739581']}),
            (idt('key:val'), {'key':['val']}),
            (idt('url:http://x'), {'url':['http://x']}),
            (idt('a:1')+idt('a:2'), {'a':['1', '2']}),
        )
        for markup, expected in cases:
            root = self.get_opf(markup)
            found = read_identifiers(root, reserved_prefixes, defaultdict(list))
            self.ae(expected, dict(found))

        # The calibre identifier ends up as the application id
        markup = idt('a:1')+idt('a:2')+idt('calibre:x')+idt('uuid:y')
        mi = read_metadata(self.get_opf(metadata=markup))
        self.ae(mi.application_id, 'x')
class TestRunner(unittest.main):
    """``unittest.main`` variant whose test suite is always TestOPF3."""

    def createTests(self):
        # Load the tests of TestOPF3 directly instead of using the base
        # class's test selection
        loader = unittest.TestLoader()
        self.test = loader.loadTestsFromTestCase(TestOPF3)
def run(verbosity=4):
    """Run the TestOPF3 tests through TestRunner at the given verbosity.

    ``exit=False`` is passed so control returns to the caller after the run.
    """
    options = {'verbosity': verbosity, 'exit': False}
    TestRunner(**options)
if __name__ == '__main__':
    # Executed as a script: run the test suite with maximum verbosity
    run(4)