mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Merge branch 'pdf-meta-check' of https://github.com/sengian/calibre
Look for recognizable identifiers (ISBN/DOI) in the Info dict and dc:identifier fields, just in case.
This commit is contained in:
commit
ebdf6738f3
@ -150,7 +150,6 @@ coding = zip(
|
||||
)
|
||||
|
||||
|
||||
|
||||
def roman(num):
|
||||
if num <= 0 or num >= 4000 or int(num) != num:
|
||||
return str(num)
|
||||
@ -174,6 +173,7 @@ def fmt_sidx(i, fmt='%.2f', use_roman=False):
|
||||
return fmt%i
|
||||
|
||||
class Resource(object):
|
||||
|
||||
'''
|
||||
Represents a resource (usually a file on the filesystem or a URL pointing
|
||||
to the web. Such resources are commonly referred to in OPF files.
|
||||
@ -217,7 +217,6 @@ class Resource(object):
|
||||
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
|
||||
self.fragment = unquote(url[-1])
|
||||
|
||||
|
||||
def href(self, basedir=None):
|
||||
'''
|
||||
Return a URL pointing to this resource. If it is a file on the filesystem
|
||||
@ -308,7 +307,6 @@ class ResourceCollection(object):
|
||||
res.set_basedir(path)
|
||||
|
||||
|
||||
|
||||
def MetaInformation(title, authors=(_('Unknown'),)):
|
||||
''' Convenient encapsulation of book metadata, needed for compatibility
|
||||
@param title: title or ``_('Unknown')`` or a MetaInformation object
|
||||
@ -368,3 +366,12 @@ def format_isbn(isbn):
|
||||
return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
|
||||
return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
|
||||
|
||||
def check_doi(doi):
    '''
    Check if something that looks like a DOI is present anywhere in the string.

    :param doi: The text to search. May be None or empty.
    :return: The first DOI-like substring found (the ``10.<registrant>/<suffix>``
             part, running up to the next whitespace), or None if the text is
             empty or contains nothing that looks like a DOI.
    '''
    if not doi:
        return None
    # Registrant codes are not limited to four digits: prefixes with five or
    # more digits (e.g. 10.13039) are in common use, so accept 4 to 9 digits
    # after the "10." directory indicator instead of exactly four.
    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None
|
||||
|
||||
|
@ -9,7 +9,8 @@ from functools import partial
|
||||
from calibre import prints
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
|
||||
from calibre.ebooks.metadata import (
|
||||
MetaInformation, string_to_authors, check_isbn, check_doi)
|
||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||
|
||||
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
|
||||
@ -134,6 +135,18 @@ def get_metadata(stream, cover=True):
|
||||
if 'xmp_metadata' in info:
|
||||
from calibre.ebooks.metadata.xmp import consolidate_metadata
|
||||
mi = consolidate_metadata(mi, info['xmp_metadata'])
|
||||
|
||||
# Look for recognizable identifiers in the info dict, if they were not
|
||||
# found in the XMP metadata
|
||||
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
|
||||
if scheme not in mi.get_identifiers():
|
||||
for k, v in info.iteritems():
|
||||
if k != 'xmp_metadata':
|
||||
val = check_func(v)
|
||||
if val:
|
||||
mi.set_identifier(scheme, val)
|
||||
break
|
||||
|
||||
if cdata:
|
||||
mi.cover_data = ('jpeg', cdata)
|
||||
return mi
|
||||
|
@ -14,7 +14,7 @@ from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from calibre import replace_entities
|
||||
from calibre.ebooks.metadata import check_isbn
|
||||
from calibre.ebooks.metadata import check_isbn, check_doi
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.utils.date import parse_date, isoformat, now
|
||||
|
||||
@ -82,7 +82,7 @@ def read_simple_property(elem):
|
||||
return replace_entities(elem.get(expand('rdf:resource'), ''))
|
||||
|
||||
def read_lang_alt(parent):
|
||||
# A text value with possibel alternate values in different languages
|
||||
# A text value with possible alternate values in different languages
|
||||
items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
|
||||
if items:
|
||||
return items[0]
|
||||
@ -114,7 +114,7 @@ def multiple_sequences(expr, root):
|
||||
|
||||
def first_alt(expr, root):
|
||||
# The first element matching expr, assumes that the element contains a
|
||||
# langauge alternate array
|
||||
# language alternate array
|
||||
for item in XPath(expr)(root):
|
||||
q = read_simple_property(read_lang_alt(item))
|
||||
if q:
|
||||
@ -195,8 +195,18 @@ def metadata_from_xmp_packet(raw_bytes):
|
||||
scheme = scheme.lower()
|
||||
if scheme == 'isbn':
|
||||
val = check_isbn(val)
|
||||
elif scheme == 'doi':
|
||||
val = check_doi(val)
|
||||
if val:
|
||||
identifiers[scheme] = val
|
||||
|
||||
# Check Dublin Core for recognizable identifier types
|
||||
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
    # Only fall back to dc:identifier for schemes that were not already
    # found among the explicitly typed identifiers.
    if scheme not in identifiers:
        val = check_func(first_simple('//dc:identifier', root))
        if val:
            # Store the value under the scheme whose checker accepted it.
            # Using a hard-coded 'doi' key here would file a validated ISBN
            # as a DOI (and could overwrite a genuine DOI).
            identifiers[scheme] = val
|
||||
|
||||
if identifiers:
|
||||
mi.set_identifiers(identifiers)
|
||||
|
||||
@ -360,7 +370,7 @@ def merge_xmp_packet(old, new):
|
||||
# First remove all data fields that are defined in the new packet from the
|
||||
# old packet
|
||||
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
|
||||
defined_tags |= {x.tag for x in item_xpath(new)}
|
||||
defined_tags |= {x.tag for x in item_xpath(new)} | {expand('dc:identifier')}
|
||||
for elem in item_xpath(old):
|
||||
if elem.tag in defined_tags:
|
||||
elem.getparent().remove(elem)
|
||||
|
Loading…
x
Reference in New Issue
Block a user