From edfbb3c7bc9d6c1feaa05c4e6eb7b43079ffc628 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 11 May 2017 14:01:47 +0530 Subject: [PATCH] Recognize http URLs in identifiers with arbitrary keys --- .../ebooks/metadata/sources/identify.py | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index 9267b1db57..dd2d7d92dd 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -533,6 +533,12 @@ def identify(log, abort, # {{{ def urls_from_identifiers(identifiers): # {{{ identifiers = {k.lower():v for k, v in identifiers.iteritems()} ans = [] + keys_left = set(identifiers) + + def add(name, k, val, url): + ans.append((name, k, val, url)) + keys_left.discard(k) + rules = msprefs['id_link_rules'] if rules: formatter = EvalFormatter() @@ -546,40 +552,49 @@ def urls_from_identifiers(identifiers): # {{{ import traceback traceback.format_exc() continue - ans.append((name, k, val, url)) + add(name, k, val, url) for plugin in all_metadata_plugins(): try: for id_type, id_val, url in plugin.get_book_urls(identifiers): - ans.append((plugin.get_book_url_name(id_type, id_val, url), id_type, id_val, url)) - except: + add(plugin.get_book_url_name(id_type, id_val, url), id_type, id_val, url) + except Exception: pass isbn = identifiers.get('isbn', None) if isbn: - ans.append((isbn, 'isbn', isbn, - 'https://www.worldcat.org/isbn/'+isbn)) + add(isbn, 'isbn', isbn, + 'https://www.worldcat.org/isbn/'+isbn) doi = identifiers.get('doi', None) if doi: - ans.append(('DOI', 'doi', doi, - 'https://dx.doi.org/'+doi)) + add('DOI', 'doi', doi, + 'https://dx.doi.org/'+doi) arxiv = identifiers.get('arxiv', None) if arxiv: - ans.append(('arXiv', 'arxiv', arxiv, - 'https://arxiv.org/abs/'+arxiv)) + add('arXiv', 'arxiv', arxiv, + 'https://arxiv.org/abs/'+arxiv) oclc = identifiers.get('oclc', None) if oclc: - ans.append(('OCLC', 'oclc', oclc, - 'https://www.worldcat.org/oclc/'+oclc)) + add('OCLC', 'oclc', oclc, + 'https://www.worldcat.org/oclc/'+oclc) issn = check_issn(identifiers.get('issn', None)) if issn: - ans.append((issn, 'issn', issn, - 'https://www.worldcat.org/issn/'+issn)) + add(issn, 'issn', issn, + 'https://www.worldcat.org/issn/'+issn) + q = {'http', 'https', 'file'} for k, url in identifiers.iteritems(): if url and re.match(r'ur[il]\d*$', k) is not None: url = url[:8].replace('|', ':') + url[8:].replace('|', ',') - if url.partition(':')[0].lower() in {'http', 'file', 'https'}: + if url.partition(':')[0].lower() in q: parts = urlparse(url) name = parts.netloc or parts.path - ans.append((name, k, url, url)) + add(name, k, url, url) + for k in tuple(keys_left): + val = identifiers.get(k) + if val: + url = val[:8].replace('|', ':') + val[8:].replace('|', ',') + if url.partition(':')[0].lower() in q: + parts = urlparse(url) + name = parts.netloc or parts.path + add(name, k, url, url) return ans # }}}