Implement renaming in book containers

This commit is contained in:
Kovid Goyal 2013-10-21 14:29:53 +05:30
parent f48d9a47ca
commit f83313f593
6 changed files with 278 additions and 32 deletions

View File

@ -100,6 +100,12 @@ def close_self_closing_tags(raw):
def uuid_id():
    ''' Return a new identifier made of the letter u followed by the text
    form of a freshly generated random UUID. '''
    random_part = unicode(uuid.uuid4())
    return 'u' + random_part
def itercsslinks(raw):
    ''' Yield (link, offset) pairs for the text raw: first one pair for every
    match of _css_url_re, then one pair for every match of _css_import_re.
    link is the first group of the match and offset is the position in raw
    at which that group starts. '''
    for pattern in (_css_url_re, _css_import_re):
        for found in pattern.finditer(raw):
            yield found.group(1), found.start(1)
def iterlinks(root, find_links_in_css=True):
'''
Iterate over all links in a OEB Document.

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, logging, sys, hashlib, uuid, re, shutil, copy
import os, logging, sys, hashlib, uuid, re, shutil
from collections import defaultdict
from io import BytesIO
from urllib import unquote as urlunquote, quote as urlquote
@ -15,6 +15,7 @@ from urlparse import urlparse
from future_builtins import zip
from lxml import etree
from cssutils import replaceUrls, getUrls
from calibre import guess_type as _guess_type, CurrentDir
from calibre.customize.ui import (plugin_for_input_format,
@ -27,7 +28,8 @@ from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.mobi.tweak import set_cover
from calibre.ebooks.oeb.base import (
serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF,
rewrite_links, iterlinks, itercsslinks)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@ -105,6 +107,7 @@ class Container(object): # {{{
self.encoding_map = {}
self.pretty_print = set()
self.cloned = False
self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')
if clone_data is not None:
self.cloned = True
@ -153,6 +156,100 @@ class Container(object): # {{{
for name, path in self.name_path_map.iteritems()}
}
def rename(self, current_name, new_name):
    ''' Renames a file from current_name to new_name. It automatically
    rebases all links inside the file if the directory the file is in
    changes. Note however, that links are not updated in the other files
    that could reference this file. This is for performance, such updates
    should be done once, in bulk.

    Raises ValueError if current_name is one of
    names_that_must_not_be_changed, if new_name already exists or if the
    directory new_name should live in is actually a file. '''
    if current_name in self.names_that_must_not_be_changed:
        raise ValueError('Renaming of %s is not allowed' % current_name)
    if self.exists(new_name):
        # Report the file actually being renamed, not the OPF
        raise ValueError('Cannot rename %s to %s as %s already exists' % (current_name, new_name, new_name))
    new_path = self.name_to_abspath(new_name)
    base = os.path.dirname(new_path)
    if os.path.isfile(base):
        raise ValueError('Cannot rename %s to %s as %s is a file' % (current_name, new_name, base))
    if not os.path.exists(base):
        os.makedirs(base)
    old_path = parent_dir = self.name_to_abspath(current_name)
    # Commit the item before moving the file, so that the moved file
    # carries any pending changes
    self.commit_item(current_name)
    os.rename(old_path, new_path)
    # Remove empty directories, walking upwards until a directory that
    # cannot be removed (for instance because it is not empty) is found
    while parent_dir:
        parent_dir = os.path.dirname(parent_dir)
        try:
            os.rmdir(parent_dir)
        except EnvironmentError:
            break

    # Carry the per-file metadata over to the new name
    for x in ('mime_map', 'encoding_map'):
        x = getattr(self, x)
        if current_name in x:
            x[new_name] = x[current_name]
    self.name_path_map[new_name] = new_path
    # Forget everything that is cached under the old name. Some of the
    # caches are dicts and some are sets, hence the two removal styles
    for x in self.cache_names:
        x = getattr(self, x)
        try:
            x.pop(current_name, None)
        except TypeError:
            x.discard(current_name)
    if current_name == self.opf_name:
        self.opf_name = new_name
    if os.path.dirname(old_path) != os.path.dirname(new_path):
        # The file moved to another directory, so the relative links it
        # contains have to be rebased
        from calibre.ebooks.oeb.polish.replace import LinkRebaser
        repl = LinkRebaser(self, current_name, new_name)
        self.replace_links(new_name, repl)
        self.dirty(new_name)
def replace_links(self, name, replace_func):
    ''' Pass every link in the file name through replace_func and store the
    result. replace_func must be a callable that accepts a URL and returns
    the replaced URL, and it must have a 'replaced' attribute that is set
    to True if any actual replacement is done. The
    :class:`LinkReplacer` and :class:`LinkRebaser` classes are convenient
    ways of creating such callables. The file is marked as dirty when
    something was replaced and replace_func.replaced is returned. '''
    media_type = self.mime_map.get(name, guess_type(name))
    if name == self.opf_name:
        for item in self.opf_xpath('//*[@href]'):
            item.set('href', replace_func(item.get('href')))
    else:
        kind = media_type.lower()
        if kind in OEB_DOCS:
            rewrite_links(self.parsed(name), replace_func)
        elif kind in OEB_STYLES:
            replaceUrls(self.parsed(name), replace_func)
        elif kind == guess_type('toc.ncx'):
            for item in self.parsed(name).xpath('//*[@src]'):
                item.set('src', replace_func(item.get('src')))
    if replace_func.replaced:
        self.dirty(name)
    return replace_func.replaced
def iterlinks(self, name, get_line_numbers=True):
''' Iterate over all links in name. If get_line_numbers is True then
yields results of the form (link, line_number, offset), where
line_number is the line number at which the link occurs and offset is
the number of characters from the start of the line. Note that offset
could actually encompass several lines if not zero. If get_line_numbers
is False only the link itself is yielded. '''
media_type = self.mime_map.get(name, guess_type(name))
# The OPF is recognised by name, every other file by its media type
if name == self.opf_name:
for elem in self.opf_xpath('//*[@href]'):
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
elif media_type.lower() in OEB_DOCS:
for el, attr, link, pos in iterlinks(self.parsed(name)):
yield (link, el.sourceline, pos) if get_line_numbers else link
elif media_type.lower() in OEB_STYLES:
if get_line_numbers:
# Work on the decoded text of the file, so that an offset is available
# for every link. The line number is always reported as 0 here and
# offset is the position of the link in the whole text.
with self.open(name) as f:
raw = self.decode(f.read())
for link, offset in itercsslinks(raw):
yield link, 0, offset
else:
for link in getUrls(self.parsed(name)):
yield link
elif media_type.lower() == guess_type('toc.ncx'):
for elem in self.parsed(name).xpath('//*[@src]'):
yield (elem.get('src'), elem.sourceline, 0) if get_line_numbers else elem.get('src')
def abspath_to_name(self, fullpath):
    ''' Convert the filesystem path fullpath into a name: the absolute form
    of the path is passed through self.relpath and the OS specific path
    separator is replaced by '/'. '''
    relative = self.relpath(os.path.abspath(fullpath))
    return relative.replace(os.sep, '/')
@ -648,15 +745,15 @@ class EpubContainer(Container):
container_path = join(self.root, 'META-INF', 'container.xml')
if not exists(container_path):
raise InvalidEpub('No META-INF/container.xml in epub')
self.container = etree.fromstring(open(container_path, 'rb').read())
opf_files = self.container.xpath((
container = etree.fromstring(open(container_path, 'rb').read())
opf_files = container.xpath((
r'child::ocf:rootfiles/ocf:rootfile'
'[@media-type="%s" and @full-path]'%guess_type('a.opf')
), namespaces={'ocf':OCF_NS}
)
if not opf_files:
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/'))
opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
if not exists(opf_path):
raise InvalidEpub('OPF file does not exist at location pointed to'
' by META-INF/container.xml')
@ -666,14 +763,35 @@ class EpubContainer(Container):
self.obfuscated_fonts = {}
if 'META-INF/encryption.xml' in self.name_path_map:
self.process_encryption()
self.parsed_cache['META-INF/container.xml'] = container
def clone_data(self, dest_dir):
''' Extend the clone data produced by the base class with the entries
specific to this class: the path to the EPUB file, a copy of the
obfuscated fonts map and a deep copy of the parsed container. '''
ans = super(EpubContainer, self).clone_data(dest_dir)
ans['pathtoepub'] = self.pathtoepub
ans['obfuscated_fonts'] = self.obfuscated_fonts.copy()
# NOTE(review): this looks like the pre-change side of the diff. The same
# commit drops the copy import and no longer assigns self.container,
# confirm that this line is meant to be removed.
ans['container'] = copy.deepcopy(self.container)
return ans
def rename(self, old_name, new_name):
''' Rename old_name to new_name using the base class implementation and
then update the EPUB specific files that refer to the renamed file:
META-INF/container.xml when the OPF is renamed and
META-INF/encryption.xml when an obfuscated font is renamed. '''
# Has to be checked before calling the base class, which changes
# self.opf_name when the OPF is renamed
is_opf = old_name == self.opf_name
super(EpubContainer, self).rename(old_name, new_name)
if is_opf:
# Point every OPF rootfile entry at the new name of the OPF
for elem in self.parsed('META-INF/container.xml').xpath((
r'child::ocf:rootfiles/ocf:rootfile'
'[@media-type="%s" and @full-path]'%guess_type('a.opf')
), namespaces={'ocf':OCF_NS}
):
# The asinine epubcheck cannot handle quoted filenames in
# container.xml
elem.set('full-path', self.opf_name)
self.dirty('META-INF/container.xml')
if old_name in self.obfuscated_fonts:
# Move the obfuscation entry to the new name and fix the references to
# the font in encryption.xml
self.obfuscated_fonts[new_name] = self.obfuscated_fonts.pop(old_name)
enc = self.parsed('META-INF/encryption.xml')
for cr in enc.xpath('//*[local-name()="CipherReference" and @URI]'):
if self.href_to_name(cr.get('URI')) == old_name:
cr.set('URI', self.name_to_href(new_name))
self.dirty('META-INF/encryption.xml')
@property
def names_that_need_not_be_manifested(self):
    ''' The names reported by the base class together with 'META-INF/' + x
    for every x in self.META_INF. '''
    inherited = super(EpubContainer, self).names_that_need_not_be_manifested
    meta_inf_names = {'META-INF/' + x for x in self.META_INF}
    return inherited | meta_inf_names

View File

@ -10,11 +10,7 @@ __docformat__ = 'restructuredtext en'
import codecs
from urlparse import urlparse
from cssutils import replaceUrls
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.oeb.polish.container import guess_type
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
class LinkReplacer(object):
@ -26,6 +22,12 @@ class LinkReplacer(object):
self.replaced = False
def __call__(self, url):
if url and url.startswith('#'):
repl = self.frag_map(self.base, url[1:])
if not repl or repl == url[1:]:
return url
self.replaced = True
return '#' + repl
name = self.container.href_to_name(url, self.base)
if not name:
return url
@ -42,23 +44,37 @@ class LinkReplacer(object):
self.replaced = True
return href
def replace_links(container, link_map, frag_map=lambda name, frag:frag):
ncx_type = guess_type('toc.ncx')
for name, media_type in container.mime_map.iteritems():
repl = LinkReplacer(name, container, link_map, frag_map)
if media_type.lower() in OEB_DOCS:
rewrite_links(container.parsed(name), repl)
elif media_type.lower() in OEB_STYLES:
replaceUrls(container.parsed(name), repl)
elif media_type.lower() == ncx_type:
for elem in container.parsed(name).xpath('//*[@src]'):
src = elem.get('src')
nsrc = repl(src)
if src != nsrc:
elem.set('src', nsrc)
class LinkRebaser(object):
if repl.replaced:
container.dirty(name)
def __init__(self, container, old_name, new_name):
    ''' Remember the container and the old and new names of the file whose
    links are rebased. The replaced flag starts out as False. '''
    self.container = container
    self.old_name = old_name
    self.new_name = new_name
    self.replaced = False
def __call__(self, url):
    ''' Return url rewritten so that it is relative to self.new_name instead
    of self.old_name. URLs that start with '#' and URLs that the container
    cannot map to a name are returned unchanged. A link to the old name of
    the file becomes a link to its new name. self.replaced is set to True
    when the returned URL differs from url. '''
    if url and url.startswith('#'):
        return url
    fragment = urlparse(url).fragment
    target = self.container.href_to_name(url, self.old_name)
    if not target:
        return url
    if target == self.old_name:
        target = self.new_name
    rebased = self.container.name_to_href(target, self.new_name)
    if fragment:
        rebased = rebased + '#' + fragment
    if rebased != url:
        self.replaced = True
    return rebased
def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
    ''' Run a :class:`LinkReplacer` built from link_map and frag_map over
    every file listed in container.mime_map. The OPF is skipped unless
    replace_in_opf is True. '''
    for name, media_type in container.mime_map.iteritems():
        if replace_in_opf or name != container.opf_name:
            container.replace_links(name, LinkReplacer(name, container, link_map, frag_map))
def smarten_punctuation(container, report):
from calibre.ebooks.conversion.preprocess import smarten_punctuation
@ -83,3 +99,19 @@ def smarten_punctuation(container, report):
m.getparent().remove(m)
container.dirty(name)
def rename_files(container, file_map):
    ''' Rename files in the container and then update the links to them in
    bulk. file_map maps current names to new names. A ValueError is raised,
    before anything is renamed, if a name is both a source and a
    destination, if a destination already exists in the container or if
    two files share a destination. '''
    destinations = tuple(file_map.itervalues())
    unique_destinations = set(destinations)
    overlap = set(file_map).intersection(unique_destinations)
    if overlap:
        raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
    for name, dest in file_map.iteritems():
        if container.exists(dest):
            raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
    if len(destinations) != len(unique_destinations):
        raise ValueError('Cannot rename, the set of destination files contains duplicates')

    link_map = {}
    for current_name, new_name in file_map.iteritems():
        container.rename(current_name, new_name)
        if new_name == container.opf_name:
            continue  # OPF is handled by the container
        link_map[current_name] = new_name
    replace_links(container, link_map, replace_in_opf=True)

View File

@ -45,6 +45,7 @@ def get_simple_book(fmt='epub'):
raw = raw.replace('LMONOI', P('fonts/liberation/LiberationMono-Italic.ttf'))
raw = raw.replace('LMONO', P('fonts/liberation/LiberationMono-Regular.ttf'))
raw = raw.replace('IMAGE1', I('marked.png'))
raw = raw.replace('IMAGE2', I('textures/light_wood.png'))
try:
with open(x, 'wb') as f:
f.write(raw.encode('utf-8'))
@ -69,3 +70,10 @@ class BaseTest(unittest.TestCase):
shutil.rmtree(self.tdir, ignore_errors=True)
del self.tdir
def check_links(self, container):
    ''' Assert that every link, in every file of the container, that the
    container can map to a name points to a name that exists. '''
    for name in container.name_path_map:
        for link in container.iterlinks(name, get_line_numbers=False):
            dest = container.href_to_name(link, name)
            if not dest:
                continue
            self.assertTrue(container.exists(dest), 'The link %s in %s does not exist' % (link, name))

View File

@ -6,12 +6,14 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import os, subprocess
from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book
from calibre.ebooks.oeb.polish.container import get_container, clone_container
from calibre.ebooks.oeb.polish.container import get_container, clone_container, OCF_NS
from calibre.ebooks.oeb.polish.replace import rename_files
from calibre.utils.filenames import nlinks_file
from calibre.ptempfile import TemporaryFile
class ContainerTests(BaseTest):
@ -73,3 +75,82 @@ class ContainerTests(BaseTest):
for x in files:
self.assertNotIn(x, raw)
def run_external_tools(self, container, gvim=False, epubcheck=True):
    ''' Commit the container to a temporary .epub file inside self.tdir and
    run the requested external programs on it, one after the other, waiting
    for each to exit: gvim when gvim is True, then epubcheck when epubcheck
    is True. '''
    with TemporaryFile(suffix='.epub', dir=self.tdir) as f:
        container.commit(outpath=f)
        commands = []
        if gvim:
            commands.append(['gvim', '-f', f])
        if epubcheck:
            commands.append(['epubcheck', f])
        for cmd in commands:
            subprocess.Popen(cmd).wait()
def test_file_rename(self):
    ' Test renaming of files '
    book = get_simple_book()
    count = [0]

    def new_container():
        # Every case gets a fresh container in its own numbered directory.
        # os.mkdir() returns None, so the path has to be kept separately
        # in order to actually pass the directory to get_container()
        count[0] += 1
        tdir = os.path.join(self.tdir, str(count[0]))
        os.mkdir(tdir)
        return get_container(book, tdir=tdir)

    # Test simple opf rename
    c = new_container()
    orig_name = c.opf_name
    name = 'renamed opf.opf'
    rename_files(c, {c.opf_name: name})
    self.assertEqual(c.opf_name, name)
    for x in ('name_path_map', 'mime_map'):
        self.assertNotIn(orig_name, getattr(c, x))
        self.assertIn(name, getattr(c, x))
    self.assertNotIn(name, c.dirtied)
    root = c.parsed('META-INF/container.xml')
    vals = set(root.xpath(
        r'child::ocf:rootfiles/ocf:rootfile/@full-path',
        namespaces={'ocf':OCF_NS}))
    self.assertSetEqual(vals, {name})
    self.check_links(c)

    # Test a rename that moves the OPF into different directory
    c = new_container()
    orig_name = c.opf_name
    name = 'renamed/again/metadata.opf'
    rename_files(c, {c.opf_name: name})
    self.check_links(c)

    # Test that renaming commits dirtied items
    c = new_container()
    name = next(c.spine_names)[0]
    root = c.parsed(name)
    root.xpath('//*[local-name()="body"]')[0].set('id', 'rename-dirty-test')
    rename_files(c, {name:'other/' + name})
    with c.open('other/' + name) as f:
        raw = f.read()
    self.assertIn(b'id="rename-dirty-test"', raw)
    self.check_links(c)

    # Test renaming of stylesheets
    c = new_container()
    rename_files(c, {'stylesheet.css':'styles/s 1.css', 'page_styles.css':'styles/p 1.css'})
    self.check_links(c)

    # Test renaming of images
    c = new_container()
    rename_files(c, {'cover.png':'images/cover img.png', 'light_wood.png':'images/light wood.png', 'marked.png':'images/marked img.png'})
    self.check_links(c)

    # Test renaming of ToC
    c = new_container()
    rename_files(c, {'toc.ncx': 'toc/toc file.ncx'})
    self.check_links(c)

    # Test renaming of font files
    c = new_container()
    rename_files(c, {'LiberationMono-Regular.ttf': 'fonts/LiberationMono Regular.ttf'})
    self.check_links(c)

    # Test renaming of text files
    c = new_container()
    rename_files(c, {'index_split_000.html':'text/page one.html', 'index_split_001.html':'text/page two.html'})
    self.check_links(c)

    # self.run_external_tools(c, gvim=True)

View File

@ -15,16 +15,17 @@
font-style: italic;
}
h1 {
h2 {
color: DarkCyan;
text-align: center;
background-image: url("IMAGE2");
}
p { font-family: "Liberation Mono"; }
</style>
</head>
<body>
<h2>A simple test page</h2>
<h2 id="page1">A simple test page</h2>
<!--lorem-->
<p>To pursue pleasure rationally encounter consequences that are extremely
painful.</p>
@ -39,11 +40,11 @@ avoids a pain that produces no resultant pleasure?</p>
<div style="text-align:center"><img alt="test" src="IMAGE1"></div>
<p>On the other hand.</p>
<p>A <a href="#page2">link to the next page</a>.</p>
<!--/lorem-->
<h2>Another test page</h2>
<h2 id="page2">Another test page</h2>
<!--lorem-->
<p>The great explorer of the truth, the master-builder of human happiness. No