mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fast hardlink based cloning of book containers
This commit is contained in:
parent
a117d4c0b6
commit
788d2ab6ea
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, logging, sys, hashlib, uuid, re
|
import os, logging, sys, hashlib, uuid, re, shutil, copy
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from urllib import unquote as urlunquote, quote as urlquote
|
from urllib import unquote as urlunquote, quote as urlquote
|
||||||
@ -30,6 +30,7 @@ from calibre.ebooks.oeb.base import (
|
|||||||
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
||||||
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
||||||
|
from calibre.utils.filenames import nlinks_file, hardlink_file
|
||||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
@ -47,6 +48,29 @@ class CSSPreProcessor(cssp):
|
|||||||
def __call__(self, data):
|
def __call__(self, data):
|
||||||
return self.MS_PAT.sub(self.ms_sub, data)
|
return self.MS_PAT.sub(self.ms_sub, data)
|
||||||
|
|
||||||
|
def clone_dir(src, dest):
|
||||||
|
' Clone a directory using hard links for the files, dest must already exist '
|
||||||
|
for x in os.listdir(src):
|
||||||
|
dpath = os.path.join(dest, x)
|
||||||
|
spath = os.path.join(src, x)
|
||||||
|
if os.path.isdir(spath):
|
||||||
|
os.mkdir(dpath)
|
||||||
|
clone_dir(spath, dpath)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
hardlink_file(spath, dpath)
|
||||||
|
except:
|
||||||
|
shutil.copy2(spath, dpath)
|
||||||
|
|
||||||
|
def clone_container(container, dest_dir):
|
||||||
|
' Efficiently clone a container using hard links '
|
||||||
|
dest_dir = os.path.abspath(os.path.realpath(dest_dir))
|
||||||
|
clone_data = container.clone_data(dest_dir)
|
||||||
|
cls = type(container)
|
||||||
|
if cls is Container:
|
||||||
|
return cls(None, None, container.log, clone_data=clone_data)
|
||||||
|
return cls(None, container.log, clone_data=clone_data)
|
||||||
|
|
||||||
class Container(object):
|
class Container(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
@ -67,8 +91,8 @@ class Container(object):
|
|||||||
|
|
||||||
book_type = 'oeb'
|
book_type = 'oeb'
|
||||||
|
|
||||||
def __init__(self, rootpath, opfpath, log):
|
def __init__(self, rootpath, opfpath, log, clone_data=None):
|
||||||
self.root = os.path.abspath(rootpath)
|
self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
|
||||||
self.log = log
|
self.log = log
|
||||||
self.html_preprocessor = HTMLPreProcessor()
|
self.html_preprocessor = HTMLPreProcessor()
|
||||||
self.css_preprocessor = CSSPreProcessor()
|
self.css_preprocessor = CSSPreProcessor()
|
||||||
@ -79,6 +103,12 @@ class Container(object):
|
|||||||
self.dirtied = set()
|
self.dirtied = set()
|
||||||
self.encoding_map = {}
|
self.encoding_map = {}
|
||||||
self.pretty_print = set()
|
self.pretty_print = set()
|
||||||
|
self.cloned = False
|
||||||
|
|
||||||
|
if clone_data is not None:
|
||||||
|
self.cloned = True
|
||||||
|
self.name_path_map = clone_data['name_path_map']
|
||||||
|
return
|
||||||
|
|
||||||
# Map of relative paths with '/' separators from root of unzipped ePub
|
# Map of relative paths with '/' separators from root of unzipped ePub
|
||||||
# to absolute paths on filesystem with os-specific separators
|
# to absolute paths on filesystem with os-specific separators
|
||||||
@ -436,7 +466,11 @@ class Container(object):
|
|||||||
self.dirtied.discard(name)
|
self.dirtied.discard(name)
|
||||||
if not keep_parsed:
|
if not keep_parsed:
|
||||||
self.parsed_cache.pop(name)
|
self.parsed_cache.pop(name)
|
||||||
with open(self.name_path_map[name], 'wb') as f:
|
dest = self.name_path_map[name]
|
||||||
|
if self.cloned and nlinks_file(dest) > 1:
|
||||||
|
# Decouple this file from its links
|
||||||
|
os.unlink(dest)
|
||||||
|
with open(dest, 'wb') as f:
|
||||||
f.write(data)
|
f.write(data)
|
||||||
|
|
||||||
def open(self, name, mode='rb'):
|
def open(self, name, mode='rb'):
|
||||||
@ -451,11 +485,29 @@ class Container(object):
|
|||||||
base = os.path.dirname(path)
|
base = os.path.dirname(path)
|
||||||
if not os.path.exists(base):
|
if not os.path.exists(base):
|
||||||
os.makedirs(base)
|
os.makedirs(base)
|
||||||
|
else:
|
||||||
|
if self.cloned and mode not in {'r', 'rb'} and os.path.exists(path) and nlinks_file(path) > 1:
|
||||||
|
# Decouple this file from its links
|
||||||
|
temp = path + 'xxx'
|
||||||
|
shutil.copyfile(path, temp)
|
||||||
|
os.unlink(path)
|
||||||
|
os.rename(temp, path)
|
||||||
return open(path, mode)
|
return open(path, mode)
|
||||||
|
|
||||||
def commit(self, outpath=None):
|
def commit(self, outpath=None, keep_parsed=False):
|
||||||
for name in tuple(self.dirtied):
|
for name in tuple(self.dirtied):
|
||||||
self.commit_item(name)
|
self.commit_item(name, keep_parsed=keep_parsed)
|
||||||
|
|
||||||
|
def clone_data(self, dest_dir):
|
||||||
|
self.commit(keep_parsed=True)
|
||||||
|
self.cloned = True
|
||||||
|
clone_dir(self.root, dest_dir)
|
||||||
|
return {
|
||||||
|
'root': dest_dir,
|
||||||
|
'name_path_map': {
|
||||||
|
name:os.path.join(dest_dir, os.path.relpath(path, self.root))
|
||||||
|
for name, path in self.name_path_map.iteritems()}
|
||||||
|
}
|
||||||
|
|
||||||
def compare_to(self, other):
|
def compare_to(self, other):
|
||||||
if set(self.name_path_map) != set(other.name_path_map):
|
if set(self.name_path_map) != set(other.name_path_map):
|
||||||
@ -487,7 +539,13 @@ class EpubContainer(Container):
|
|||||||
'rights.xml': False,
|
'rights.xml': False,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, pathtoepub, log):
|
def __init__(self, pathtoepub, log, clone_data=None):
|
||||||
|
if clone_data is not None:
|
||||||
|
super(EpubContainer, self).__init__(None, None, log, clone_data=clone_data)
|
||||||
|
for x in ('pathtoepub', 'container', 'obfuscated_fonts'):
|
||||||
|
setattr(self, x, clone_data[x])
|
||||||
|
return
|
||||||
|
|
||||||
self.pathtoepub = pathtoepub
|
self.pathtoepub = pathtoepub
|
||||||
tdir = self.root = os.path.abspath(os.path.realpath(PersistentTemporaryDirectory('_epub_container')))
|
tdir = self.root = os.path.abspath(os.path.realpath(PersistentTemporaryDirectory('_epub_container')))
|
||||||
with open(self.pathtoepub, 'rb') as stream:
|
with open(self.pathtoepub, 'rb') as stream:
|
||||||
@ -527,6 +585,13 @@ class EpubContainer(Container):
|
|||||||
if 'META-INF/encryption.xml' in self.name_path_map:
|
if 'META-INF/encryption.xml' in self.name_path_map:
|
||||||
self.process_encryption()
|
self.process_encryption()
|
||||||
|
|
||||||
|
def clone_data(self, dest_dir):
|
||||||
|
ans = super(EpubContainer, self).clone_data(dest_dir)
|
||||||
|
ans['pathtoepub'] = self.pathtoepub
|
||||||
|
ans['obfuscated_fonts'] = self.obfuscated_fonts.copy()
|
||||||
|
ans['container'] = copy.deepcopy(self.container)
|
||||||
|
return ans
|
||||||
|
|
||||||
def process_encryption(self):
|
def process_encryption(self):
|
||||||
fonts = {}
|
fonts = {}
|
||||||
enc = self.parsed('META-INF/encryption.xml')
|
enc = self.parsed('META-INF/encryption.xml')
|
||||||
@ -578,8 +643,8 @@ class EpubContainer(Container):
|
|||||||
decrypt_font(tkey, path, alg)
|
decrypt_font(tkey, path, alg)
|
||||||
self.obfuscated_fonts[font] = (alg, tkey)
|
self.obfuscated_fonts[font] = (alg, tkey)
|
||||||
|
|
||||||
def commit(self, outpath=None):
|
def commit(self, outpath=None, keep_parsed=False):
|
||||||
super(EpubContainer, self).commit()
|
super(EpubContainer, self).commit(keep_parsed=keep_parsed)
|
||||||
for name in self.obfuscated_fonts:
|
for name in self.obfuscated_fonts:
|
||||||
if name not in self.name_path_map:
|
if name not in self.name_path_map:
|
||||||
continue
|
continue
|
||||||
@ -620,7 +685,13 @@ class AZW3Container(Container):
|
|||||||
|
|
||||||
book_type = 'azw3'
|
book_type = 'azw3'
|
||||||
|
|
||||||
def __init__(self, pathtoazw3, log):
|
def __init__(self, pathtoazw3, log, clone_data=None):
|
||||||
|
if clone_data is not None:
|
||||||
|
super(AZW3Container, self).__init__(None, None, log, clone_data=clone_data)
|
||||||
|
for x in ('pathtoazw3', 'obfuscated_fonts'):
|
||||||
|
setattr(self, x, clone_data[x])
|
||||||
|
return
|
||||||
|
|
||||||
self.pathtoazw3 = pathtoazw3
|
self.pathtoazw3 = pathtoazw3
|
||||||
tdir = self.root = os.path.abspath(os.path.realpath(PersistentTemporaryDirectory('_azw3_container')))
|
tdir = self.root = os.path.abspath(os.path.realpath(PersistentTemporaryDirectory('_azw3_container')))
|
||||||
with open(pathtoazw3, 'rb') as stream:
|
with open(pathtoazw3, 'rb') as stream:
|
||||||
@ -659,8 +730,14 @@ class AZW3Container(Container):
|
|||||||
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
||||||
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
||||||
|
|
||||||
def commit(self, outpath=None):
|
def clone_data(self, dest_dir):
|
||||||
super(AZW3Container, self).commit()
|
ans = super(AZW3Container, self).clone_data(dest_dir)
|
||||||
|
ans['pathtoazw3'] = self.pathtoazw3
|
||||||
|
ans['obfuscated_fonts'] = self.obfuscated_fonts.copy()
|
||||||
|
return ans
|
||||||
|
|
||||||
|
def commit(self, outpath=None, keep_parsed=False):
|
||||||
|
super(AZW3Container, self).commit(keep_parsed=keep_parsed)
|
||||||
if outpath is None:
|
if outpath is None:
|
||||||
outpath = self.pathtoazw3
|
outpath = self.pathtoazw3
|
||||||
from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
|
from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
|
||||||
|
@ -295,6 +295,15 @@ def windows_hardlink(src, dest):
|
|||||||
msg = u'Creating hardlink from %s to %s failed: %%s' % (src, dest)
|
msg = u'Creating hardlink from %s to %s failed: %%s' % (src, dest)
|
||||||
raise Exception(msg % ('hardlink size: %d not the same as source size' % sz))
|
raise Exception(msg % ('hardlink size: %d not the same as source size' % sz))
|
||||||
|
|
||||||
|
def windows_nlinks(path):
|
||||||
|
import win32file
|
||||||
|
dwFlagsAndAttributes = win32file.FILE_FLAG_BACKUP_SEMANTICS if os.path.isdir(path) else 0
|
||||||
|
handle = win32file.CreateFile(path, win32file.GENERIC_READ, win32file.FILE_SHARE_READ, None, win32file.OPEN_EXISTING, dwFlagsAndAttributes, None)
|
||||||
|
try:
|
||||||
|
return win32file.GetFileInformationByHandle(handle)[7]
|
||||||
|
finally:
|
||||||
|
handle.Close()
|
||||||
|
|
||||||
class WindowsAtomicFolderMove(object):
|
class WindowsAtomicFolderMove(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
@ -400,6 +409,12 @@ def hardlink_file(src, dest):
|
|||||||
return
|
return
|
||||||
os.link(src, dest)
|
os.link(src, dest)
|
||||||
|
|
||||||
|
def nlinks_file(path):
|
||||||
|
' Return number of hardlinks to the file '
|
||||||
|
if iswindows:
|
||||||
|
return windows_nlinks(path)
|
||||||
|
return os.stat(path).st_nlink
|
||||||
|
|
||||||
def atomic_rename(oldpath, newpath):
|
def atomic_rename(oldpath, newpath):
|
||||||
'''Replace the file newpath with the file oldpath. Can fail if the files
|
'''Replace the file newpath with the file oldpath. Can fail if the files
|
||||||
are on different volumes. If succeeds, guaranteed to be atomic. newpath may
|
are on different volumes. If succeeds, guaranteed to be atomic. newpath may
|
||||||
|
Loading…
x
Reference in New Issue
Block a user