Merge branch 'pdf-meta-check' of https://github.com/sengian/calibre

Look for recognizable identifiers (ISBN/DOI) in the Info dict and
dc:identifier fields, just in case.
This commit is contained in:
Kovid Goyal 2014-02-17 06:42:05 +05:30
commit ebdf6738f3
3 changed files with 40 additions and 10 deletions

View File

@ -16,7 +16,7 @@ from calibre.utils.config_base import tweaks
try:
_author_pat = re.compile(tweaks['authors_split_regex'])
except:
prints ('Author split regexp:', tweaks['authors_split_regex'],
prints('Author split regexp:', tweaks['authors_split_regex'],
'is invalid, using default')
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
@ -150,7 +150,6 @@ coding = zip(
)
def roman(num):
if num <= 0 or num >= 4000 or int(num) != num:
return str(num)
@ -174,6 +173,7 @@ def fmt_sidx(i, fmt='%.2f', use_roman=False):
return fmt%i
class Resource(object):
'''
Represents a resource (usually a file on the filesystem or a URL pointing
to the web). Such resources are commonly referred to in OPF files.
@ -217,7 +217,6 @@ class Resource(object):
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
self.fragment = unquote(url[-1])
def href(self, basedir=None):
'''
Return a URL pointing to this resource. If it is a file on the filesystem
@ -308,7 +307,6 @@ class ResourceCollection(object):
res.set_basedir(path)
def MetaInformation(title, authors=(_('Unknown'),)):
''' Convenient encapsulation of book metadata, needed for compatibility
@param title: title or ``_('Unknown')`` or a MetaInformation object
@ -368,3 +366,12 @@ def format_isbn(isbn):
return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
def check_doi(doi):
    '''
    Check if something that looks like a DOI is present anywhere in the string.

    The text is searched for ``10.`` followed by a registrant code of 4 to 9
    digits, a ``/`` and a run of non-whitespace characters. Registrant codes
    longer than four digits are in common use (for example ``10.48550``), so
    the code length is not pinned to exactly four digits.

    :param doi: The text to search. May be ``None`` or empty.
    :return: The first matching substring, or ``None`` if ``doi`` is falsy or
             contains nothing that looks like a DOI.
    '''
    if not doi:
        return None
    # No anchoring: the DOI may be embedded in surrounding text such as
    # "doi:10.1234/abc" or a resolver URL.
    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None

View File

@ -9,7 +9,8 @@ from functools import partial
from calibre import prints
from calibre.constants import iswindows
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
from calibre.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
@ -134,6 +135,18 @@ def get_metadata(stream, cover=True):
if 'xmp_metadata' in info:
from calibre.ebooks.metadata.xmp import consolidate_metadata
mi = consolidate_metadata(mi, info['xmp_metadata'])
# Look for recognizable identifiers in the info dict, if they were not
# found in the XMP metadata
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
if scheme not in mi.get_identifiers():
for k, v in info.iteritems():
if k != 'xmp_metadata':
val = check_func(v)
if val:
mi.set_identifier(scheme, val)
break
if cdata:
mi.cover_data = ('jpeg', cdata)
return mi

View File

@ -14,7 +14,7 @@ from lxml import etree
from lxml.builder import ElementMaker
from calibre import replace_entities
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata import check_isbn, check_doi
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import parse_date, isoformat, now
@ -82,7 +82,7 @@ def read_simple_property(elem):
return replace_entities(elem.get(expand('rdf:resource'), ''))
def read_lang_alt(parent):
# A text value with possibel alternate values in different languages
# A text value with possible alternate values in different languages
items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
if items:
return items[0]
@ -114,7 +114,7 @@ def multiple_sequences(expr, root):
def first_alt(expr, root):
# The first element matching expr, assumes that the element contains a
# langauge alternate array
# language alternate array
for item in XPath(expr)(root):
q = read_simple_property(read_lang_alt(item))
if q:
@ -195,8 +195,18 @@ def metadata_from_xmp_packet(raw_bytes):
scheme = scheme.lower()
if scheme == 'isbn':
val = check_isbn(val)
elif scheme == 'doi':
val = check_doi(val)
if val:
identifiers[scheme] = val
# Check Dublin Core for recognizable identifier types. A scheme is only
# looked up here when no value for it has been stored in identifiers yet.
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
    if scheme not in identifiers:
        val = check_func(first_simple('//dc:identifier', root))
        if val:
            # Store the value under the scheme whose checker accepted it.
            # Writing to a fixed 'doi' key would record an ISBN as a DOI
            # and leave the 'isbn' entry unset.
            identifiers[scheme] = val
if identifiers:
mi.set_identifiers(identifiers)
@ -360,7 +370,7 @@ def merge_xmp_packet(old, new):
# First remove all data fields that are defined in the new packet from the
# old packet
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
defined_tags |= {x.tag for x in item_xpath(new)}
defined_tags |= {x.tag for x in item_xpath(new)} | {expand('dc:identifier')}
for elem in item_xpath(old):
if elem.tag in defined_tags:
elem.getparent().remove(elem)