diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 13b0b7daa5..9f0b10faa4 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -16,7 +16,7 @@ from calibre.utils.config_base import tweaks try: _author_pat = re.compile(tweaks['authors_split_regex']) except: - prints ('Author split regexp:', tweaks['authors_split_regex'], + prints('Author split regexp:', tweaks['authors_split_regex'], 'is invalid, using default') _author_pat = re.compile(r'(?i),?\s+(and|with)\s+') @@ -150,7 +150,6 @@ coding = zip( ) - def roman(num): if num <= 0 or num >= 4000 or int(num) != num: return str(num) @@ -174,6 +173,7 @@ def fmt_sidx(i, fmt='%.2f', use_roman=False): return fmt%i class Resource(object): + ''' Represents a resource (usually a file on the filesystem or a URL pointing to the web. Such resources are commonly referred to in OPF files. @@ -217,7 +217,6 @@ class Resource(object): self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep))) self.fragment = unquote(url[-1]) - def href(self, basedir=None): ''' Return a URL pointing to this resource. 
# DOI shape: the "10." directory indicator, a registrant code of four to nine
# digits, a slash and then the suffix (taken here as a run of non-whitespace).
_doi_pat = re.compile(r'10\.\d{4,9}/\S+')


def check_doi(doi):
    '''
    Check if something that looks like a DOI is present anywhere in the string.

    A DOI is recognized as ``10.`` followed by a registrant code of four to
    nine digits, a ``/`` and a run of non-whitespace characters. Only the
    shape is checked, the DOI is not resolved or otherwise validated.

    :param doi: The text to search. May be empty or None.
    :return: The first DOI-like substring found, or None if there is none or
             if ``doi`` is not something that can be searched as text.
    '''
    if not doi:
        return None
    try:
        found = _doi_pat.search(doi)
    except TypeError:
        # Values pulled out of metadata are not guaranteed to be text; a
        # value that cannot be searched cannot contain a DOI
        return None
    if found is None:
        return None
    return found.group()
+ for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems(): + if scheme not in mi.get_identifiers(): + for k, v in info.iteritems(): + if k != 'xmp_metadata': + val = check_func(v) + if val: + mi.set_identifier(scheme, val) + break + if cdata: mi.cover_data = ('jpeg', cdata) return mi diff --git a/src/calibre/ebooks/metadata/xmp.py b/src/calibre/ebooks/metadata/xmp.py index 365fc64a2c..3fbdcd5dc9 100644 --- a/src/calibre/ebooks/metadata/xmp.py +++ b/src/calibre/ebooks/metadata/xmp.py @@ -14,7 +14,7 @@ from lxml import etree from lxml.builder import ElementMaker from calibre import replace_entities -from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata import check_isbn, check_doi from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_date, isoformat, now @@ -82,7 +82,7 @@ def read_simple_property(elem): return replace_entities(elem.get(expand('rdf:resource'), '')) def read_lang_alt(parent): - # A text value with possibel alternate values in different languages + # A text value with possible alternate values in different languages items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent) if items: return items[0] @@ -114,7 +114,7 @@ def multiple_sequences(expr, root): def first_alt(expr, root): # The first element matching expr, assumes that the element contains a - # langauge alternate array + # language alternate array for item in XPath(expr)(root): q = read_simple_property(read_lang_alt(item)) if q: @@ -195,8 +195,18 @@ def metadata_from_xmp_packet(raw_bytes): scheme = scheme.lower() if scheme == 'isbn': val = check_isbn(val) + elif scheme == 'doi': + val = check_doi(val) if val: identifiers[scheme] = val + + # Check Dublin Core for recognizable identifier types + for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems(): + if scheme not in identifiers: + val = check_func(first_simple('//dc:identifier', root)) + if val: + identifiers['doi'] = val + if 
identifiers: mi.set_identifiers(identifiers) @@ -360,7 +370,7 @@ def merge_xmp_packet(old, new): # First remove all data fields that are defined in the new packet from the # old packet defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES} - defined_tags |= {x.tag for x in item_xpath(new)} + defined_tags |= {x.tag for x in item_xpath(new)} | {expand('dc:identifier')} for elem in item_xpath(old): if elem.tag in defined_tags: elem.getparent().remove(elem)