mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Merge branch 'pdf-meta-check' of https://github.com/sengian/calibre
Look for recognizable identifiers (ISBN/DOI) in the Info dict and dc:identifier fields, just in case.
This commit is contained in:
commit
ebdf6738f3
@ -16,7 +16,7 @@ from calibre.utils.config_base import tweaks
|
|||||||
try:
|
try:
|
||||||
_author_pat = re.compile(tweaks['authors_split_regex'])
|
_author_pat = re.compile(tweaks['authors_split_regex'])
|
||||||
except:
|
except:
|
||||||
prints ('Author split regexp:', tweaks['authors_split_regex'],
|
prints('Author split regexp:', tweaks['authors_split_regex'],
|
||||||
'is invalid, using default')
|
'is invalid, using default')
|
||||||
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
|
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
|
||||||
|
|
||||||
@ -150,7 +150,6 @@ coding = zip(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def roman(num):
|
def roman(num):
|
||||||
if num <= 0 or num >= 4000 or int(num) != num:
|
if num <= 0 or num >= 4000 or int(num) != num:
|
||||||
return str(num)
|
return str(num)
|
||||||
@ -174,6 +173,7 @@ def fmt_sidx(i, fmt='%.2f', use_roman=False):
|
|||||||
return fmt%i
|
return fmt%i
|
||||||
|
|
||||||
class Resource(object):
|
class Resource(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Represents a resource (usually a file on the filesystem or a URL pointing
|
Represents a resource (usually a file on the filesystem or a URL pointing
|
||||||
to the web. Such resources are commonly referred to in OPF files.
|
to the web. Such resources are commonly referred to in OPF files.
|
||||||
@ -217,7 +217,6 @@ class Resource(object):
|
|||||||
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
|
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
|
||||||
self.fragment = unquote(url[-1])
|
self.fragment = unquote(url[-1])
|
||||||
|
|
||||||
|
|
||||||
def href(self, basedir=None):
|
def href(self, basedir=None):
|
||||||
'''
|
'''
|
||||||
Return a URL pointing to this resource. If it is a file on the filesystem
|
Return a URL pointing to this resource. If it is a file on the filesystem
|
||||||
@ -240,7 +239,7 @@ class Resource(object):
|
|||||||
return ''+frag
|
return ''+frag
|
||||||
try:
|
try:
|
||||||
rpath = relpath(self.path, basedir)
|
rpath = relpath(self.path, basedir)
|
||||||
except OSError: # On windows path and basedir could be on different drives
|
except OSError: # On windows path and basedir could be on different drives
|
||||||
rpath = self.path
|
rpath = self.path
|
||||||
if isinstance(rpath, unicode):
|
if isinstance(rpath, unicode):
|
||||||
rpath = rpath.encode('utf-8')
|
rpath = rpath.encode('utf-8')
|
||||||
@ -308,7 +307,6 @@ class ResourceCollection(object):
|
|||||||
res.set_basedir(path)
|
res.set_basedir(path)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def MetaInformation(title, authors=(_('Unknown'),)):
|
def MetaInformation(title, authors=(_('Unknown'),)):
|
||||||
''' Convenient encapsulation of book metadata, needed for compatibility
|
''' Convenient encapsulation of book metadata, needed for compatibility
|
||||||
@param title: title or ``_('Unknown')`` or a MetaInformation object
|
@param title: title or ``_('Unknown')`` or a MetaInformation object
|
||||||
@ -368,3 +366,12 @@ def format_isbn(isbn):
|
|||||||
return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
|
return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
|
||||||
return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
|
return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))
|
||||||
|
|
||||||
|
def check_doi(doi):
    '''
    Check if something that looks like a DOI is present anywhere in the string.

    @param doi: text to scan; may be empty or None
    @return: the first DOI-like substring (registrant prefix, slash and the
             non-whitespace suffix that follows it), or None if nothing in
             the text looks like a DOI
    '''
    if not doi:
        return None
    # Registrant codes are not limited to four digits: modern prefixes such as
    # 10.48550 (arXiv) have five, so accept 4 to 9 digits after the "10.".
    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None
|
||||||
|
|
||||||
|
@ -9,7 +9,8 @@ from functools import partial
|
|||||||
from calibre import prints
|
from calibre import prints
|
||||||
from calibre.constants import iswindows
|
from calibre.constants import iswindows
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
|
from calibre.ebooks.metadata import (
|
||||||
|
MetaInformation, string_to_authors, check_isbn, check_doi)
|
||||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||||
|
|
||||||
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
|
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
|
||||||
@ -134,6 +135,18 @@ def get_metadata(stream, cover=True):
|
|||||||
if 'xmp_metadata' in info:
|
if 'xmp_metadata' in info:
|
||||||
from calibre.ebooks.metadata.xmp import consolidate_metadata
|
from calibre.ebooks.metadata.xmp import consolidate_metadata
|
||||||
mi = consolidate_metadata(mi, info['xmp_metadata'])
|
mi = consolidate_metadata(mi, info['xmp_metadata'])
|
||||||
|
|
||||||
|
# Look for recognizable identifiers in the info dict, if they were not
|
||||||
|
# found in the XMP metadata
|
||||||
|
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
|
||||||
|
if scheme not in mi.get_identifiers():
|
||||||
|
for k, v in info.iteritems():
|
||||||
|
if k != 'xmp_metadata':
|
||||||
|
val = check_func(v)
|
||||||
|
if val:
|
||||||
|
mi.set_identifier(scheme, val)
|
||||||
|
break
|
||||||
|
|
||||||
if cdata:
|
if cdata:
|
||||||
mi.cover_data = ('jpeg', cdata)
|
mi.cover_data = ('jpeg', cdata)
|
||||||
return mi
|
return mi
|
||||||
|
@ -14,7 +14,7 @@ from lxml import etree
|
|||||||
from lxml.builder import ElementMaker
|
from lxml.builder import ElementMaker
|
||||||
|
|
||||||
from calibre import replace_entities
|
from calibre import replace_entities
|
||||||
from calibre.ebooks.metadata import check_isbn
|
from calibre.ebooks.metadata import check_isbn, check_doi
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.utils.date import parse_date, isoformat, now
|
from calibre.utils.date import parse_date, isoformat, now
|
||||||
|
|
||||||
@ -82,7 +82,7 @@ def read_simple_property(elem):
|
|||||||
return replace_entities(elem.get(expand('rdf:resource'), ''))
|
return replace_entities(elem.get(expand('rdf:resource'), ''))
|
||||||
|
|
||||||
def read_lang_alt(parent):
|
def read_lang_alt(parent):
|
||||||
# A text value with possibel alternate values in different languages
|
# A text value with possible alternate values in different languages
|
||||||
items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
|
items = XPath('descendant::rdf:li[@xml:lang="x-default"]')(parent)
|
||||||
if items:
|
if items:
|
||||||
return items[0]
|
return items[0]
|
||||||
@ -114,7 +114,7 @@ def multiple_sequences(expr, root):
|
|||||||
|
|
||||||
def first_alt(expr, root):
|
def first_alt(expr, root):
|
||||||
# The first element matching expr, assumes that the element contains a
|
# The first element matching expr, assumes that the element contains a
|
||||||
# langauge alternate array
|
# language alternate array
|
||||||
for item in XPath(expr)(root):
|
for item in XPath(expr)(root):
|
||||||
q = read_simple_property(read_lang_alt(item))
|
q = read_simple_property(read_lang_alt(item))
|
||||||
if q:
|
if q:
|
||||||
@ -195,8 +195,18 @@ def metadata_from_xmp_packet(raw_bytes):
|
|||||||
scheme = scheme.lower()
|
scheme = scheme.lower()
|
||||||
if scheme == 'isbn':
|
if scheme == 'isbn':
|
||||||
val = check_isbn(val)
|
val = check_isbn(val)
|
||||||
|
elif scheme == 'doi':
|
||||||
|
val = check_doi(val)
|
||||||
if val:
|
if val:
|
||||||
identifiers[scheme] = val
|
identifiers[scheme] = val
|
||||||
|
|
||||||
|
# Check Dublin Core for recognizable identifier types
|
||||||
|
for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
    # Only fall back to dc:identifier for schemes that were not already
    # picked up from the scheme-specific fields handled above.
    if scheme not in identifiers:
        val = check_func(first_simple('//dc:identifier', root))
        if val:
            # Store the value under the scheme that actually validated it.
            # Writing to 'doi' unconditionally would file an ISBN found in
            # dc:identifier as the DOI (or overwrite a real DOI with it).
            identifiers[scheme] = val
|
||||||
|
|
||||||
if identifiers:
|
if identifiers:
|
||||||
mi.set_identifiers(identifiers)
|
mi.set_identifiers(identifiers)
|
||||||
|
|
||||||
@ -360,7 +370,7 @@ def merge_xmp_packet(old, new):
|
|||||||
# First remove all data fields that are defined in the new packet from the
|
# First remove all data fields that are defined in the new packet from the
|
||||||
# old packet
|
# old packet
|
||||||
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
|
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES}
|
||||||
defined_tags |= {x.tag for x in item_xpath(new)}
|
defined_tags |= {x.tag for x in item_xpath(new)} | {expand('dc:identifier')}
|
||||||
for elem in item_xpath(old):
|
for elem in item_xpath(old):
|
||||||
if elem.tag in defined_tags:
|
if elem.tag in defined_tags:
|
||||||
elem.getparent().remove(elem)
|
elem.getparent().remove(elem)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user