mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up reading metadata from OPF files by not reading the NCX
Also, do not render the first page of the EPUB if it is encrypted. The old logic for detecting this was broken when the OPF file is not in the root of the EPUB. Also make the OPF metadata reading code no longer depend directly on the opf2.OPF class.
This commit is contained in:
parent
921a579d35
commit
ca2d3986cd
@ -292,12 +292,12 @@ class DocXMetadataReader(MetadataReaderPlugin):
|
|||||||
class OPFMetadataReader(MetadataReaderPlugin):
|
class OPFMetadataReader(MetadataReaderPlugin):
|
||||||
|
|
||||||
name = 'Read OPF metadata'
|
name = 'Read OPF metadata'
|
||||||
file_types = set(['opf'])
|
file_types = {'opf'}
|
||||||
description = _('Read metadata from %s files')%'OPF'
|
description = _('Read metadata from %s files')%'OPF'
|
||||||
|
|
||||||
def get_metadata(self, stream, ftype):
|
def get_metadata(self, stream, ftype):
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf import get_metadata
|
||||||
return OPF(stream, os.getcwdu()).to_book_metadata()
|
return get_metadata(stream)[0]
|
||||||
|
|
||||||
class PDBMetadataReader(MetadataReaderPlugin):
|
class PDBMetadataReader(MetadataReaderPlugin):
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
|
|||||||
from calibre.utils.localunzip import LocalZipFile
|
from calibre.utils.localunzip import LocalZipFile
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
from calibre.ebooks.metadata.opf import get_metadata as get_metadata_from_opf
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
from calibre.ptempfile import TemporaryDirectory, PersistentTemporaryFile
|
from calibre.ptempfile import TemporaryDirectory, PersistentTemporaryFile
|
||||||
from calibre import CurrentDir, walk
|
from calibre import CurrentDir, walk
|
||||||
@ -96,16 +97,27 @@ class OCFReader(OCF):
|
|||||||
self.opf_path = self.container[OPF.MIMETYPE]
|
self.opf_path = self.container[OPF.MIMETYPE]
|
||||||
if not self.opf_path:
|
if not self.opf_path:
|
||||||
raise EPubException("missing OPF package file entry in container")
|
raise EPubException("missing OPF package file entry in container")
|
||||||
|
self._opf_cached = self._encryption_meta_cached = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def opf(self):
|
||||||
|
if self._opf_cached is None:
|
||||||
try:
|
try:
|
||||||
with closing(self.open(self.opf_path)) as f:
|
with closing(self.open(self.opf_path)) as f:
|
||||||
self.opf = OPF(f, self.root, populate_spine=False)
|
self._opf_cached = OPF(f, self.root, populate_spine=False)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise EPubException("missing OPF package file")
|
raise EPubException("missing OPF package file")
|
||||||
|
return self._opf_cached
|
||||||
|
|
||||||
|
@property
|
||||||
|
def encryption_meta(self):
|
||||||
|
if self._encryption_meta_cached is None:
|
||||||
try:
|
try:
|
||||||
with closing(self.open(self.ENCRYPTION_PATH)) as f:
|
with closing(self.open(self.ENCRYPTION_PATH)) as f:
|
||||||
self.encryption_meta = Encryption(f.read())
|
self._encryption_meta_cached = Encryption(f.read())
|
||||||
except:
|
except:
|
||||||
self.encryption_meta = Encryption(None)
|
self._encryption_meta_cached = Encryption(None)
|
||||||
|
return self._encryption_meta_cached
|
||||||
|
|
||||||
|
|
||||||
class OCFZipReader(OCFReader):
|
class OCFZipReader(OCFReader):
|
||||||
@ -147,11 +159,10 @@ class OCFDirReader(OCFReader):
|
|||||||
def open(self, path, *args, **kwargs):
|
def open(self, path, *args, **kwargs):
|
||||||
return open(os.path.join(self.root, path), *args, **kwargs)
|
return open(os.path.join(self.root, path), *args, **kwargs)
|
||||||
|
|
||||||
def render_cover(opf, opf_path, zf, reader=None):
|
def render_cover(cpage, zf, reader=None):
|
||||||
from calibre.ebooks import render_html_svg_workaround
|
from calibre.ebooks import render_html_svg_workaround
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
|
|
||||||
cpage = opf.first_spine_item()
|
|
||||||
if not cpage:
|
if not cpage:
|
||||||
return
|
return
|
||||||
if reader is not None and reader.encryption_meta.is_encrypted(cpage):
|
if reader is not None and reader.encryption_meta.is_encrypted(cpage):
|
||||||
@ -160,8 +171,7 @@ def render_cover(opf, opf_path, zf, reader=None):
|
|||||||
with TemporaryDirectory('_epub_meta') as tdir:
|
with TemporaryDirectory('_epub_meta') as tdir:
|
||||||
with CurrentDir(tdir):
|
with CurrentDir(tdir):
|
||||||
zf.extractall()
|
zf.extractall()
|
||||||
opf_path = opf_path.replace('/', os.sep)
|
cpage = os.path.join(tdir, cpage)
|
||||||
cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage)
|
|
||||||
if not os.path.exists(cpage):
|
if not os.path.exists(cpage):
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -175,7 +185,7 @@ def render_cover(opf, opf_path, zf, reader=None):
|
|||||||
os.remove(f)
|
os.remove(f)
|
||||||
ffpat = re.compile(br'@font-face.*?{.*?}',
|
ffpat = re.compile(br'@font-face.*?{.*?}',
|
||||||
re.DOTALL|re.IGNORECASE)
|
re.DOTALL|re.IGNORECASE)
|
||||||
with open(cpage, 'r+b') as f:
|
with lopen(cpage, 'r+b') as f:
|
||||||
raw = f.read()
|
raw = f.read()
|
||||||
f.truncate(0)
|
f.truncate(0)
|
||||||
f.seek(0)
|
f.seek(0)
|
||||||
@ -190,7 +200,7 @@ def render_cover(opf, opf_path, zf, reader=None):
|
|||||||
if href:
|
if href:
|
||||||
path = os.path.join(os.path.dirname(cpage), href)
|
path = os.path.join(os.path.dirname(cpage), href)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
with open(path, 'r+b') as f:
|
with lopen(path, 'r+b') as f:
|
||||||
raw = f.read()
|
raw = f.read()
|
||||||
f.truncate(0)
|
f.truncate(0)
|
||||||
f.seek(0)
|
f.seek(0)
|
||||||
@ -199,24 +209,15 @@ def render_cover(opf, opf_path, zf, reader=None):
|
|||||||
|
|
||||||
return render_html_svg_workaround(cpage, default_log)
|
return render_html_svg_workaround(cpage, default_log)
|
||||||
|
|
||||||
def get_cover(opf, opf_path, stream, reader=None):
|
def get_cover(raster_cover, first_spine_item, reader):
|
||||||
raster_cover = opf.raster_cover
|
zf = reader.archive
|
||||||
stream.seek(0)
|
|
||||||
try:
|
|
||||||
zf = ZipFile(stream)
|
|
||||||
except:
|
|
||||||
stream.seek(0)
|
|
||||||
zf = LocalZipFile(stream)
|
|
||||||
|
|
||||||
if raster_cover:
|
if raster_cover:
|
||||||
base = posixpath.dirname(opf_path)
|
if reader.encryption_meta.is_encrypted(raster_cover):
|
||||||
cpath = posixpath.normpath(posixpath.join(base, raster_cover))
|
|
||||||
if reader is not None and \
|
|
||||||
reader.encryption_meta.is_encrypted(cpath):
|
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
member = zf.getinfo(cpath)
|
member = zf.getinfo(raster_cover)
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
f = zf.open(member)
|
f = zf.open(member)
|
||||||
@ -225,19 +226,25 @@ def get_cover(opf, opf_path, stream, reader=None):
|
|||||||
zf.close()
|
zf.close()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
return render_cover(opf, opf_path, zf, reader=reader)
|
return render_cover(first_spine_item, zf, reader=reader)
|
||||||
|
|
||||||
def get_metadata(stream, extract_cover=True):
|
def get_metadata(stream, extract_cover=True):
|
||||||
""" Return metadata as a :class:`Metadata` object """
|
""" Return metadata as a :class:`Metadata` object """
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
reader = get_zip_reader(stream)
|
reader = get_zip_reader(stream)
|
||||||
mi = reader.opf.to_book_metadata()
|
opfstream = reader.open(reader.opf_path)
|
||||||
|
mi, ver, raster_cover, first_spine_item = get_metadata_from_opf(opfstream)
|
||||||
if extract_cover:
|
if extract_cover:
|
||||||
|
base = posixpath.dirname(reader.opf_path)
|
||||||
|
if raster_cover:
|
||||||
|
raster_cover = posixpath.normpath(posixpath.join(base, raster_cover))
|
||||||
|
if first_spine_item:
|
||||||
|
first_spine_item = posixpath.normpath(posixpath.join(base, first_spine_item))
|
||||||
try:
|
try:
|
||||||
cdata = get_cover(reader.opf, reader.opf_path, stream, reader=reader)
|
cdata = get_cover(raster_cover, first_spine_item, reader)
|
||||||
if cdata is not None:
|
if cdata is not None:
|
||||||
mi.cover_data = ('jpg', cdata)
|
mi.cover_data = ('jpg', cdata)
|
||||||
except:
|
except Exception:
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
mi.timestamp = None
|
mi.timestamp = None
|
||||||
|
@ -14,9 +14,9 @@ from urlparse import urlparse
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.ebooks import escape_xpath_attr
|
from calibre.ebooks import escape_xpath_attr
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
|
||||||
from calibre.constants import __appname__, __version__, filesystem_encoding
|
from calibre.constants import __appname__, __version__, filesystem_encoding
|
||||||
from calibre.ebooks.metadata.toc import TOC
|
from calibre.ebooks.metadata.toc import TOC
|
||||||
|
from calibre.ebooks.metadata.utils import parse_opf
|
||||||
from calibre.ebooks.metadata import string_to_authors, MetaInformation, check_isbn
|
from calibre.ebooks.metadata import string_to_authors, MetaInformation, check_isbn
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.utils.date import parse_date, isoformat
|
from calibre.utils.date import parse_date, isoformat
|
||||||
@ -502,7 +502,6 @@ def dump_dict(cats):
|
|||||||
class OPF(object): # {{{
|
class OPF(object): # {{{
|
||||||
|
|
||||||
MIMETYPE = 'application/oebps-package+xml'
|
MIMETYPE = 'application/oebps-package+xml'
|
||||||
PARSER = etree.XMLParser(recover=True)
|
|
||||||
NAMESPACES = {
|
NAMESPACES = {
|
||||||
None: "http://www.idpf.org/2007/opf",
|
None: "http://www.idpf.org/2007/opf",
|
||||||
'dc': "http://purl.org/dc/elements/1.1/",
|
'dc': "http://purl.org/dc/elements/1.1/",
|
||||||
@ -564,21 +563,11 @@ class OPF(object): # {{{
|
|||||||
formatter=json.loads, renderer=dump_dict)
|
formatter=json.loads, renderer=dump_dict)
|
||||||
|
|
||||||
def __init__(self, stream, basedir=os.getcwdu(), unquote_urls=True,
|
def __init__(self, stream, basedir=os.getcwdu(), unquote_urls=True,
|
||||||
populate_spine=True, try_to_guess_cover=True):
|
populate_spine=True, try_to_guess_cover=True, preparsed_opf=None, read_toc=True):
|
||||||
if not hasattr(stream, 'read'):
|
|
||||||
stream = open(stream, 'rb')
|
|
||||||
raw = stream.read()
|
|
||||||
if not raw:
|
|
||||||
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
|
|
||||||
self.try_to_guess_cover = try_to_guess_cover
|
self.try_to_guess_cover = try_to_guess_cover
|
||||||
self.basedir = self.base_dir = basedir
|
self.basedir = self.base_dir = basedir
|
||||||
self.path_to_html_toc = self.html_toc_fragment = None
|
self.path_to_html_toc = self.html_toc_fragment = None
|
||||||
raw, self.encoding = xml_to_unicode(raw, strip_encoding_pats=True,
|
self.root = parse_opf(stream) if preparsed_opf is None else preparsed_opf
|
||||||
resolve_entities=True, assume_utf8=True)
|
|
||||||
raw = raw[raw.find('<'):]
|
|
||||||
self.root = etree.fromstring(raw, self.PARSER)
|
|
||||||
if self.root is None:
|
|
||||||
raise ValueError('Not an OPF file')
|
|
||||||
try:
|
try:
|
||||||
self.package_version = float(self.root.get('version', None))
|
self.package_version = float(self.root.get('version', None))
|
||||||
except (AttributeError, TypeError, ValueError):
|
except (AttributeError, TypeError, ValueError):
|
||||||
@ -603,7 +592,10 @@ class OPF(object): # {{{
|
|||||||
guide = self.guide_path(self.root)
|
guide = self.guide_path(self.root)
|
||||||
self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
|
self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
|
||||||
self.cover_data = (None, None)
|
self.cover_data = (None, None)
|
||||||
|
if read_toc:
|
||||||
self.find_toc()
|
self.find_toc()
|
||||||
|
else:
|
||||||
|
self.toc = None
|
||||||
self.read_user_metadata()
|
self.read_user_metadata()
|
||||||
|
|
||||||
def read_user_metadata(self):
|
def read_user_metadata(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user