From 39d399b4b7a689dd379c56cc4e28afe065154046 Mon Sep 17 00:00:00 2001
From: sengian
Date: Sun, 16 Feb 2014 21:27:21 +0100
Subject: [PATCH 1/4] Add a DOI check function to metadata

---
 src/calibre/ebooks/metadata/__init__.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py
index 13b0b7daa5..8f8904554d 100644
--- a/src/calibre/ebooks/metadata/__init__.py
+++ b/src/calibre/ebooks/metadata/__init__.py
@@ -368,3 +368,20 @@ def format_isbn(isbn):
         return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
     return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
 
+def check_doi(doi):
+    '''
+    Return the first DOI-like substring of ``doi``, or None if there is none.
+
+    ``doi`` may be a bare DOI or free text that contains one (for example
+    ``doi:10.1000/182``). Only the ``10.<registrant>/<suffix>`` part is
+    returned; the suffix runs up to the next whitespace character.
+    '''
+    if not doi:
+        return None
+    # Registrant codes are not always four digits long (10.48550/... is a
+    # valid prefix), so accept 4 to 9 digits as Crossref recommends.
+    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
+    if doi_check is not None:
+        return doi_check.group()
+    return None
+
From c68c3b6da299140d5384f98d2120ca9f9c721380 Mon Sep 17 00:00:00 2001
From: sengian
Date: Sun, 16 Feb 2014 21:35:37 +0100
Subject: [PATCH 2/4] Add dc:identifier check for DOI & doi validation

---
 src/calibre/ebooks/metadata/xmp.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/metadata/xmp.py b/src/calibre/ebooks/metadata/xmp.py
index 365fc64a2c..d9d828acd6 100644
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -13,8 +13,9 @@ from collections import defaultdict
 from lxml import etree
 from lxml.builder import ElementMaker
 
+from calibre import prints
 from calibre import replace_entities
-from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata import check_isbn, check_doi
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.date import parse_date, isoformat, now
 
@@ -38,7 +39,7 @@ NS_MAP = {
     'x': 'adobe:ns:meta/',
     'calibre': 'http://calibre-ebook.com/xmp-namespace',
 }
-KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'}
+KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi', 'identifier'}
 
 def expand(name):
     prefix, name = name.partition(':')[::2]
@@ -187,14 +188,17 @@ def metadata_from_xmp_packet(raw_bytes):
         for scheme, value in read_xmp_identifers(xmpid):
             if scheme and value:
                 identifiers[scheme.lower()] = value
+    prints(repr(identifiers))
 
-    for namespace in ('prism', 'pdfx'):
+    for namespace in ('prism', 'pdfx', 'dc'):
         for scheme in KNOWN_ID_SCHEMES:
             if scheme not in identifiers:
                 val = first_simple('//%s:%s' % (namespace, scheme), root)
                 scheme = scheme.lower()
                 if scheme == 'isbn':
                     val = check_isbn(val)
+                elif scheme == 'doi' or scheme == 'identifier':
+                    val = check_doi(val)
                 if val:
                     identifiers[scheme] = val
     if identifiers:
From 7b3c699c7a71031c34ca60d2119841d20c1c2a35 Mon Sep 17 00:00:00 2001
From: sengian
Date: Sun, 16 Feb 2014 21:52:48 +0100
Subject: [PATCH 3/4] Only read DOI from dc:identifier; drop debug print, fix
 comment typos

---
 src/calibre/ebooks/metadata/xmp.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/metadata/xmp.py b/src/calibre/ebooks/metadata/xmp.py
index d9d828acd6..80cf71fc58 100644
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -39,7 +39,7 @@ NS_MAP = {
     'x': 'adobe:ns:meta/',
     'calibre': 'http://calibre-ebook.com/xmp-namespace',
 }
-KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi', 'identifier'}
+KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'}
 
 def expand(name):
     prefix, name = name.partition(':')[::2]
@@ -83,7 +83,7 @@ def read_simple_property(elem):
         return replace_entities(elem.get(expand('rdf:resource'), ''))
 
 def read_lang_alt(parent):
-    # A text value with possibel alternate values in different languages
+    # A text value with possible alternate values in different languages
     items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
     if items:
         return items[0]
@@ -115,7 +115,7 @@ def multiple_sequences(expr, root):
 
 def first_alt(expr, root):
     # The first element matching expr, assumes that the element contains a
-    # langauge alternate array
+    # language alternate array
     for item in XPath(expr)(root):
         q = read_simple_property(read_lang_alt(item))
         if q:
@@ -188,19 +188,26 @@ def metadata_from_xmp_packet(raw_bytes):
         for scheme, value in read_xmp_identifers(xmpid):
             if scheme and value:
                 identifiers[scheme.lower()] = value
-    prints(repr(identifiers))
 
-    for namespace in ('prism', 'pdfx', 'dc'):
+    for namespace in ('prism', 'pdfx'):
         for scheme in KNOWN_ID_SCHEMES:
             if scheme not in identifiers:
                 val = first_simple('//%s:%s' % (namespace, scheme), root)
                 scheme = scheme.lower()
                 if scheme == 'isbn':
                     val = check_isbn(val)
-                elif scheme == 'doi' or scheme == 'identifier':
+                elif scheme == 'doi':
                     val = check_doi(val)
                 if val:
                     identifiers[scheme] = val
+
+    # Check Dublin Core for identifier, only DOI considered
+    if 'doi' not in identifiers:
+        val = first_simple('//dc:identifier', root)
+        val = check_doi(val)
+        if val:
+            identifiers['doi'] = val
+
     if identifiers:
         mi.set_identifiers(identifiers)
 
From f6ed89ddb746905866a6dd4532b55e5cc6c62f18 Mon Sep 17 00:00:00 2001
From: sengian
Date: Sun, 16 Feb 2014 22:12:48 +0100
Subject: [PATCH 4/4] Check doi in usual info fields

---
 src/calibre/ebooks/metadata/pdf.py | 15 ++++++++++++++-
 src/calibre/ebooks/metadata/xmp.py |  1 -
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py
index 6187fea135..59055887b1 100644
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -9,7 +9,8 @@ from functools import partial
 from calibre import prints
 from calibre.constants import iswindows
 from calibre.ptempfile import TemporaryDirectory
-from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
+from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
+        check_isbn, check_doi
 from calibre.utils.ipc.simple_worker import fork_job, WorkerError
 
 #_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
@@ -134,6 +135,18 @@ def get_metadata(stream, cover=True):
     if 'xmp_metadata' in info:
         from calibre.ebooks.metadata.xmp import consolidate_metadata
         mi = consolidate_metadata(mi, info['xmp_metadata'])
+
+    # No DOI identifier so far: fall back to scanning the plain info fields.
+    # set_identifier() adds just the DOI; set_identifiers() would replace
+    # every identifier already set on mi (e.g. an ISBN).
+    if 'doi' not in mi.get_identifiers():
+        for k,v in info.iteritems():
+            if k != 'xmp_metadata':
+                doi = check_doi(v)
+                if doi:
+                    mi.set_identifier('doi', doi)
+                    break
+
     if cdata:
         mi.cover_data = ('jpeg', cdata)
     return mi
diff --git a/src/calibre/ebooks/metadata/xmp.py b/src/calibre/ebooks/metadata/xmp.py
index 80cf71fc58..98953a31bb 100644
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -13,7 +13,6 @@ from collections import defaultdict
 from lxml import etree
 from lxml.builder import ElementMaker
 
-from calibre import prints
 from calibre import replace_entities
 from calibre.ebooks.metadata import check_isbn, check_doi
 from calibre.ebooks.metadata.book.base import Metadata