Merge branch 'pdf-meta-check' of https://github.com/sengian/calibre

Look for recognizable identifiers (ISBN/DOI) in the Info dict and
dc:identifier fields, just in case.
This commit is contained in:
Kovid Goyal 2014-02-17 06:42:05 +05:30
commit ebdf6738f3
3 changed files with 40 additions and 10 deletions

View File

@ -16,7 +16,7 @@ from calibre.utils.config_base import tweaks
try: try:
_author_pat = re.compile(tweaks['authors_split_regex']) _author_pat = re.compile(tweaks['authors_split_regex'])
except: except:
prints ('Author split regexp:', tweaks['authors_split_regex'], prints('Author split regexp:', tweaks['authors_split_regex'],
'is invalid, using default') 'is invalid, using default')
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+') _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
@ -150,7 +150,6 @@ coding = zip(
) )
def roman(num): def roman(num):
if num <= 0 or num >= 4000 or int(num) != num: if num <= 0 or num >= 4000 or int(num) != num:
return str(num) return str(num)
@ -174,6 +173,7 @@ def fmt_sidx(i, fmt='%.2f', use_roman=False):
return fmt%i return fmt%i
class Resource(object): class Resource(object):
''' '''
Represents a resource (usually a file on the filesystem or a URL pointing Represents a resource (usually a file on the filesystem or a URL pointing
to the web. Such resources are commonly referred to in OPF files. to the web. Such resources are commonly referred to in OPF files.
@ -217,7 +217,6 @@ class Resource(object):
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep))) self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
self.fragment = unquote(url[-1]) self.fragment = unquote(url[-1])
def href(self, basedir=None): def href(self, basedir=None):
''' '''
Return a URL pointing to this resource. If it is a file on the filesystem Return a URL pointing to this resource. If it is a file on the filesystem
@ -308,7 +307,6 @@ class ResourceCollection(object):
res.set_basedir(path) res.set_basedir(path)
def MetaInformation(title, authors=(_('Unknown'),)): def MetaInformation(title, authors=(_('Unknown'),)):
''' Convenient encapsulation of book metadata, needed for compatibility ''' Convenient encapsulation of book metadata, needed for compatibility
@param title: title or ``_('Unknown')`` or a MetaInformation object @param title: title or ``_('Unknown')`` or a MetaInformation object
@ -368,3 +366,12 @@ def format_isbn(isbn):
return '-'.join((i[:2], i[2:6], i[6:9], i[9])) return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12])) return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
def check_doi(doi):
    '''
    Check if something that looks like a DOI is present anywhere in the string.

    :param doi: Text to scan. May be None or empty.
    :return: The first substring that looks like a DOI (``10.<registrant>/<suffix>``,
             the suffix running up to the next whitespace character), or None if
             the input is empty or contains no such substring.
    '''
    if not doi:
        return None
    # The registrant code after "10." is not restricted to exactly four
    # digits; prefixes with five or more digits are in common use, so accept
    # 4 to 9 digits rather than silently rejecting those DOIs.
    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None

View File

@ -9,7 +9,8 @@ from functools import partial
from calibre import prints from calibre import prints
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn from calibre.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
from calibre.utils.ipc.simple_worker import fork_job, WorkerError from calibre.utils.ipc.simple_worker import fork_job, WorkerError
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)') #_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
@ -134,6 +135,18 @@ def get_metadata(stream, cover=True):
if 'xmp_metadata' in info: if 'xmp_metadata' in info:
from calibre.ebooks.metadata.xmp import consolidate_metadata from calibre.ebooks.metadata.xmp import consolidate_metadata
mi = consolidate_metadata(mi, info['xmp_metadata']) mi = consolidate_metadata(mi, info['xmp_metadata'])
# Look for recognizable identifiers in the info dict, if they were not
# found in the XMP metadata
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
if scheme not in mi.get_identifiers():
for k, v in info.iteritems():
if k != 'xmp_metadata':
val = check_func(v)
if val:
mi.set_identifier(scheme, val)
break
if cdata: if cdata:
mi.cover_data = ('jpeg', cdata) mi.cover_data = ('jpeg', cdata)
return mi return mi

View File

@ -14,7 +14,7 @@ from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre import replace_entities from calibre import replace_entities
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn, check_doi
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import parse_date, isoformat, now from calibre.utils.date import parse_date, isoformat, now
@ -82,7 +82,7 @@ def read_simple_property(elem):
return replace_entities(elem.get(expand('rdf:resource'), '')) return replace_entities(elem.get(expand('rdf:resource'), ''))
def read_lang_alt(parent): def read_lang_alt(parent):
# A text value with possibel alternate values in different languages # A text value with possible alternate values in different languages
items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent) items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
if items: if items:
return items[0] return items[0]
@ -114,7 +114,7 @@ def multiple_sequences(expr, root):
def first_alt(expr, root): def first_alt(expr, root):
# The first element matching expr, assumes that the element contains a # The first element matching expr, assumes that the element contains a
# langauge alternate array # language alternate array
for item in XPath(expr)(root): for item in XPath(expr)(root):
q = read_simple_property(read_lang_alt(item)) q = read_simple_property(read_lang_alt(item))
if q: if q:
@ -195,8 +195,18 @@ def metadata_from_xmp_packet(raw_bytes):
scheme = scheme.lower() scheme = scheme.lower()
if scheme == 'isbn': if scheme == 'isbn':
val = check_isbn(val) val = check_isbn(val)
elif scheme == 'doi':
val = check_doi(val)
if val: if val:
identifiers[scheme] = val identifiers[scheme] = val
# Check Dublin Core for recognizable identifier types
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
if scheme not in identifiers:
val = check_func(first_simple('//dc:identifier', root))
if val:
identifiers['doi'] = val
if identifiers: if identifiers:
mi.set_identifiers(identifiers) mi.set_identifiers(identifiers)
@ -360,7 +370,7 @@ def merge_xmp_packet(old, new):
# First remove all data fields that are defined in the new packet from the # First remove all data fields that are defined in the new packet from the
# old packet # old packet
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES} defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
defined_tags |= {x.tag for x in item_xpath(new)} defined_tags |= {x.tag for x in item_xpath(new)} | {expand('dc:identifier')}
for elem in item_xpath(old): for elem in item_xpath(old):
if elem.tag in defined_tags: if elem.tag in defined_tags:
elem.getparent().remove(elem) elem.getparent().remove(elem)