Merge branch 'pdf-meta-check' of https://github.com/sengian/calibre

Look for recognizable identifiers (ISBN/DOI) in the Info dict and dc:identifier fields, just in case.
2025-11-22 22:43:02 -05:00 · 2014-02-17 06:42:05 +05:30 · 2014-02-17 06:42:05 +05:30 · ebdf6738f3
commit ebdf6738f3
parent 434e36a8a7 f6ed89ddb7
3 changed files with 40 additions and 10 deletions
--- a/src/calibre/ebooks/metadata/__init__.py
+++ b/src/calibre/ebooks/metadata/__init__.py
@@ -16,7 +16,7 @@ from calibre.utils.config_base import tweaks
 try:
    _author_pat = re.compile(tweaks['authors_split_regex'])
 except:
-    prints ('Author split regexp:', tweaks['authors_split_regex'],
+    prints('Author split regexp:', tweaks['authors_split_regex'],
            'is invalid, using default')
    _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')

@@ -150,7 +150,6 @@ coding = zip(
 )


-
 def roman(num):
    if num <= 0 or num >= 4000 or int(num) != num:
        return str(num)
@@ -174,6 +173,7 @@ def fmt_sidx(i, fmt='%.2f', use_roman=False):
    return fmt%i

 class Resource(object):
+
    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web. Such resources are commonly referred to in OPF files.
@@ -217,7 +217,6 @@ class Resource(object):
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = unquote(url[-1])

-
    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
@@ -308,7 +307,6 @@ class ResourceCollection(object):
            res.set_basedir(path)


-
 def MetaInformation(title, authors=(_('Unknown'),)):
    ''' Convenient encapsulation of book metadata, needed for compatibility
        @param title: title or ``_('Unknown')`` or a MetaInformation object
@@ -368,3 +366,12 @@ def format_isbn(isbn):
        return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
    return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))

+def check_doi(doi):
+    '''
+    Return the first DOI-like substring found anywhere in ``doi``.
+
+    A match is ``10.`` followed by a registrant code of 4 to 9 digits, a
+    slash, and a run of non-whitespace characters. Returns None when the
+    input is falsy (None, empty string) or contains no such substring.
+    '''
+    if not doi:
+        return None
+    # Registrant codes are not limited to four digits; accepting 4-9 digits
+    # keeps DOIs such as 10.12345/xyz from being silently skipped.
+    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
+    if doi_check is not None:
+        return doi_check.group()
+    return None
+
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -9,7 +9,8 @@ from functools import partial
 from calibre import prints
 from calibre.constants import iswindows
 from calibre.ptempfile import TemporaryDirectory
-from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
+from calibre.ebooks.metadata import (
+    MetaInformation, string_to_authors, check_isbn, check_doi)
 from calibre.utils.ipc.simple_worker import fork_job, WorkerError

 #_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
@@ -134,6 +135,18 @@ def get_metadata(stream, cover=True):
    if 'xmp_metadata' in info:
        from calibre.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info['xmp_metadata'])
+
+    # Look for recognizable identifiers in the info dict, if they were not
+    # found in the XMP metadata
+    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
+        if scheme not in mi.get_identifiers():
+            for k, v in info.iteritems():
+                if k != 'xmp_metadata':
+                    val = check_func(v)
+                    if val:
+                        mi.set_identifier(scheme, val)
+                        break
+
    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi
--- a/src/calibre/ebooks/metadata/xmp.py
+++ b/src/calibre/ebooks/metadata/xmp.py
@@ -14,7 +14,7 @@ from lxml import etree
 from lxml.builder import ElementMaker

 from calibre import replace_entities
-from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata import check_isbn, check_doi
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.date import parse_date, isoformat, now

@@ -82,7 +82,7 @@ def read_simple_property(elem):
    return replace_entities(elem.get(expand('rdf:resource'), ''))

 def read_lang_alt(parent):
-    # A text value with possibel alternate values in different languages
+    # A text value with possible alternate values in different languages
    items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
    if items:
        return items[0]
@@ -114,7 +114,7 @@ def multiple_sequences(expr, root):

 def first_alt(expr, root):
    # The first element matching expr, assumes that the element contains a
-    # langauge alternate array
+    # language alternate array
    for item in XPath(expr)(root):
        q = read_simple_property(read_lang_alt(item))
        if q:
@@ -195,8 +195,18 @@ def metadata_from_xmp_packet(raw_bytes):
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
+                elif scheme == 'doi':
+                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
+
+    # Check Dublin Core for recognizable identifier types. A value accepted
+    # by a checker must be stored under the scheme being checked; writing to
+    # a fixed 'doi' key would file an ISBN found in dc:identifier as a DOI.
+    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
+        if scheme not in identifiers:
+            val = check_func(first_simple('//dc:identifier', root))
+            if val:
+                identifiers[scheme] = val
+
    if identifiers:
        mi.set_identifiers(identifiers)

@@ -360,7 +370,7 @@ def merge_xmp_packet(old, new):
    # First remove all data fields that are defined in the new packet from the
    # old packet
    defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
-    defined_tags |= {x.tag for x in item_xpath(new)}
+    defined_tags |= {x.tag for x in item_xpath(new)} | {expand('dc:identifier')}
    for elem in item_xpath(old):
        if elem.tag in defined_tags:
            elem.getparent().remove(elem)