diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 009902138f..e0e7a665cd 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -100,6 +100,12 @@ def close_self_closing_tags(raw):
def uuid_id():
return 'u'+unicode(uuid.uuid4())
+def itercsslinks(raw):
+    ''' Yield a (link, offset) pair for every match of _css_url_re in raw,
+    followed by every match of _css_import_re. link is the first capture
+    group of the match and offset is the index in raw at which that group
+    starts. '''
+    for pattern in (_css_url_re, _css_import_re):
+        for found in pattern.finditer(raw):
+            yield found.group(1), found.start(1)
+
def iterlinks(root, find_links_in_css=True):
'''
Iterate over all links in a OEB Document.
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 02c4df6937..785056aa2b 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import os, logging, sys, hashlib, uuid, re, shutil, copy
+import os, logging, sys, hashlib, uuid, re, shutil
from collections import defaultdict
from io import BytesIO
from urllib import unquote as urlunquote, quote as urlquote
@@ -15,6 +15,7 @@ from urlparse import urlparse
from future_builtins import zip
from lxml import etree
+from cssutils import replaceUrls, getUrls
from calibre import guess_type as _guess_type, CurrentDir
from calibre.customize.ui import (plugin_for_input_format,
@@ -27,7 +28,8 @@ from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.mobi.tweak import set_cover
from calibre.ebooks.oeb.base import (
- serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
+ serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF,
+ rewrite_links, iterlinks, itercsslinks)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@@ -105,6 +107,7 @@ class Container(object): # {{{
self.encoding_map = {}
self.pretty_print = set()
self.cloned = False
+ self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')
if clone_data is not None:
self.cloned = True
@@ -153,6 +156,100 @@ class Container(object): # {{{
for name, path in self.name_path_map.iteritems()}
}
+    def rename(self, current_name, new_name):
+        ''' Renames a file from current_name to new_name. It automatically
+        rebases all links inside the file if the directory the file is in
+        changes. Note however, that links are not updated in the other files
+        that could reference this file. This is for performance, such updates
+        should be done once, in bulk.
+
+        Raises ValueError if current_name is one of the names that must not
+        be changed, if new_name already exists, or if the directory that
+        would contain new_name is actually a file. '''
+        if current_name in self.names_that_must_not_be_changed:
+            raise ValueError('Renaming of %s is not allowed' % current_name)
+        if self.exists(new_name):
+            # Report the file actually being renamed, not the OPF
+            raise ValueError('Cannot rename %s to %s as %s already exists' % (current_name, new_name, new_name))
+        new_path = self.name_to_abspath(new_name)
+        base = os.path.dirname(new_path)
+        if os.path.isfile(base):
+            raise ValueError('Cannot rename %s to %s as %s is a file' % (current_name, new_name, base))
+        if not os.path.exists(base):
+            os.makedirs(base)
+        old_path = parent_dir = self.name_to_abspath(current_name)
+        # Commit the item before the file is moved on disk
+        self.commit_item(current_name)
+        os.rename(old_path, new_path)
+        # Remove empty directories, walking upwards until os.rmdir() refuses
+        # (for example because the directory still has other entries)
+        while parent_dir:
+            parent_dir = os.path.dirname(parent_dir)
+            try:
+                os.rmdir(parent_dir)
+            except EnvironmentError:
+                break
+
+        # Carry the per-file metadata over to the new name
+        for x in ('mime_map', 'encoding_map'):
+            x = getattr(self, x)
+            if current_name in x:
+                x[new_name] = x[current_name]
+        self.name_path_map[new_name] = new_path
+        # Forget everything recorded under the old name
+        for x in self.cache_names:
+            x = getattr(self, x)
+            try:
+                x.pop(current_name, None)
+            except TypeError:
+                # Sets (e.g. pretty_print) have no two-argument pop()
+                x.discard(current_name)
+        if current_name == self.opf_name:
+            self.opf_name = new_name
+        if os.path.dirname(old_path) != os.path.dirname(new_path):
+            # The file moved to another directory, so the relative links
+            # inside it have to be rebased. Imported here, at call time,
+            # rather than at module level.
+            from calibre.ebooks.oeb.polish.replace import LinkRebaser
+            repl = LinkRebaser(self, current_name, new_name)
+            self.replace_links(new_name, repl)
+            self.dirty(new_name)
+
+    def replace_links(self, name, replace_func):
+        ''' Pass every link in the file name through replace_func, a callable
+        that accepts a URL and returns the URL to use instead. replace_func
+        must also have a 'replaced' attribute that is set to True if any
+        actual replacement is done. The :class:`LinkReplacer` and
+        :class:`LinkRebaser` classes are convenient ways of creating such
+        callables. The file is marked dirty when replace_func.replaced is
+        true, and that value is returned. '''
+        media_type = self.mime_map.get(name, guess_type(name))
+        if name == self.opf_name:
+            for node in self.opf_xpath('//*[@href]'):
+                node.set('href', replace_func(node.get('href')))
+        else:
+            mt = media_type.lower()
+            if mt in OEB_DOCS:
+                rewrite_links(self.parsed(name), replace_func)
+            elif mt in OEB_STYLES:
+                replaceUrls(self.parsed(name), replace_func)
+            elif mt == guess_type('toc.ncx'):
+                for node in self.parsed(name).xpath('//*[@src]'):
+                    node.set('src', replace_func(node.get('src')))
+
+        if replace_func.replaced:
+            self.dirty(name)
+        return replace_func.replaced
+
+    def iterlinks(self, name, get_line_numbers=True):
+        ''' Iterate over all links in the file name. If get_line_numbers is
+        True, each result is a tuple of the form (link, line_number, offset),
+        where line_number is the line on which the link occurs and offset is
+        the number of characters from the start of that line. Note that a
+        non-zero offset can encompass several lines; for stylesheets the line
+        number is always 0 and the offset is measured from the start of the
+        decoded text. If get_line_numbers is False only the link is yielded. '''
+        media_type = self.mime_map.get(name, guess_type(name))
+        if name == self.opf_name:
+            for node in self.opf_xpath('//*[@href]'):
+                href = node.get('href')
+                yield (href, node.sourceline, 0) if get_line_numbers else href
+            return
+        mt = media_type.lower()
+        if mt in OEB_DOCS:
+            for el, attr, link, pos in iterlinks(self.parsed(name)):
+                yield (link, el.sourceline, pos) if get_line_numbers else link
+        elif mt in OEB_STYLES:
+            if get_line_numbers:
+                with self.open(name) as f:
+                    raw = self.decode(f.read())
+                    for link, offset in itercsslinks(raw):
+                        yield link, 0, offset
+            else:
+                for link in getUrls(self.parsed(name)):
+                    yield link
+        elif mt == guess_type('toc.ncx'):
+            for node in self.parsed(name).xpath('//*[@src]'):
+                src = node.get('src')
+                yield (src, node.sourceline, 0) if get_line_numbers else src
+
def abspath_to_name(self, fullpath):
return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/')
@@ -648,15 +745,15 @@ class EpubContainer(Container):
container_path = join(self.root, 'META-INF', 'container.xml')
if not exists(container_path):
raise InvalidEpub('No META-INF/container.xml in epub')
- self.container = etree.fromstring(open(container_path, 'rb').read())
- opf_files = self.container.xpath((
+ container = etree.fromstring(open(container_path, 'rb').read())
+ opf_files = container.xpath((
r'child::ocf:rootfiles/ocf:rootfile'
'[@media-type="%s" and @full-path]'%guess_type('a.opf')
), namespaces={'ocf':OCF_NS}
)
if not opf_files:
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
- opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/'))
+ opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
if not exists(opf_path):
raise InvalidEpub('OPF file does not exist at location pointed to'
' by META-INF/container.xml')
@@ -666,14 +763,35 @@ class EpubContainer(Container):
self.obfuscated_fonts = {}
if 'META-INF/encryption.xml' in self.name_path_map:
self.process_encryption()
+ self.parsed_cache['META-INF/container.xml'] = container
def clone_data(self, dest_dir):
ans = super(EpubContainer, self).clone_data(dest_dir)
ans['pathtoepub'] = self.pathtoepub
ans['obfuscated_fonts'] = self.obfuscated_fonts.copy()
- ans['container'] = copy.deepcopy(self.container)
return ans
+ def rename(self, old_name, new_name):
+ ''' Rename old_name to new_name using the base class rename() and then
+ update the EPUB specific files. If the OPF itself was renamed, the
+ rootfile entries in META-INF/container.xml are pointed at the new OPF
+ name. If the renamed file is listed in obfuscated_fonts, its entry is
+ moved to new_name and the matching CipherReference URIs in
+ META-INF/encryption.xml are updated. '''
+ # Must be computed before calling the base class, which updates
+ # self.opf_name when the OPF is the file being renamed
+ is_opf = old_name == self.opf_name
+ super(EpubContainer, self).rename(old_name, new_name)
+ if is_opf:
+ for elem in self.parsed('META-INF/container.xml').xpath((
+ r'child::ocf:rootfiles/ocf:rootfile'
+ '[@media-type="%s" and @full-path]'%guess_type('a.opf')
+ ), namespaces={'ocf':OCF_NS}
+ ):
+ # The asinine epubcheck cannot handle quoted filenames in
+ # container.xml
+ elem.set('full-path', self.opf_name)
+ self.dirty('META-INF/container.xml')
+ if old_name in self.obfuscated_fonts:
+ # Move the obfuscated font entry over to the new name
+ self.obfuscated_fonts[new_name] = self.obfuscated_fonts.pop(old_name)
+ enc = self.parsed('META-INF/encryption.xml')
+ # Only references that resolve to the renamed file are rewritten
+ for cr in enc.xpath('//*[local-name()="CipherReference" and @URI]'):
+ if self.href_to_name(cr.get('URI')) == old_name:
+ cr.set('URI', self.name_to_href(new_name))
+ self.dirty('META-INF/encryption.xml')
+
@property
def names_that_need_not_be_manifested(self):
return super(EpubContainer, self).names_that_need_not_be_manifested | {'META-INF/' + x for x in self.META_INF}
diff --git a/src/calibre/ebooks/oeb/polish/replace.py b/src/calibre/ebooks/oeb/polish/replace.py
index b26589d5bf..a78f7b5ef7 100644
--- a/src/calibre/ebooks/oeb/polish/replace.py
+++ b/src/calibre/ebooks/oeb/polish/replace.py
@@ -10,11 +10,7 @@ __docformat__ = 'restructuredtext en'
import codecs
from urlparse import urlparse
-from cssutils import replaceUrls
-
from calibre.ebooks.chardet import strip_encoding_declarations
-from calibre.ebooks.oeb.polish.container import guess_type
-from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
class LinkReplacer(object):
@@ -26,6 +22,12 @@ class LinkReplacer(object):
self.replaced = False
def __call__(self, url):
+ if url and url.startswith('#'):
+ repl = self.frag_map(self.base, url[1:])
+ if not repl or repl == url[1:]:
+ return url
+ self.replaced = True
+ return '#' + repl
name = self.container.href_to_name(url, self.base)
if not name:
return url
@@ -42,23 +44,37 @@ class LinkReplacer(object):
self.replaced = True
return href
-def replace_links(container, link_map, frag_map=lambda name, frag:frag):
- ncx_type = guess_type('toc.ncx')
- for name, media_type in container.mime_map.iteritems():
- repl = LinkReplacer(name, container, link_map, frag_map)
- if media_type.lower() in OEB_DOCS:
- rewrite_links(container.parsed(name), repl)
- elif media_type.lower() in OEB_STYLES:
- replaceUrls(container.parsed(name), repl)
- elif media_type.lower() == ncx_type:
- for elem in container.parsed(name).xpath('//*[@src]'):
- src = elem.get('src')
- nsrc = repl(src)
- if src != nsrc:
- elem.set('src', nsrc)
+class LinkRebaser(object):
- if repl.replaced:
- container.dirty(name)
+    ''' Callable that rewrites the links inside a file that is being renamed
+    from old_name to new_name, so that each link is expressed relative to
+    new_name instead of old_name. The 'replaced' attribute becomes True as
+    soon as a call returns a URL that differs from the one it was given. '''
+
+    def __init__(self, container, old_name, new_name):
+        self.container = container
+        self.old_name = old_name
+        self.new_name = new_name
+        self.replaced = False
+
+    def __call__(self, url):
+        ''' Return url rebased from old_name to new_name. Links that consist
+        only of a fragment and links that the container cannot map to a name
+        are returned unchanged. '''
+        if url and url.startswith('#'):
+            return url
+        fragment = urlparse(url).fragment
+        target = self.container.href_to_name(url, self.old_name)
+        if not target:
+            return url
+        if target == self.old_name:
+            # A link from the file to itself has to follow the rename
+            target = self.new_name
+        rebased = self.container.name_to_href(target, self.new_name)
+        if fragment:
+            rebased += '#' + fragment
+        if rebased != url:
+            self.replaced = True
+        return rebased
+
+
+def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
+    ''' Run a :class:`LinkReplacer` built from link_map and frag_map over
+    every file listed in container.mime_map, using
+    container.replace_links(). The OPF is skipped unless replace_in_opf is
+    True. '''
+    for name in container.mime_map:
+        if not replace_in_opf and name == container.opf_name:
+            continue
+        replacer = LinkReplacer(name, container, link_map, frag_map)
+        container.replace_links(name, replacer)
def smarten_punctuation(container, report):
from calibre.ebooks.conversion.preprocess import smarten_punctuation
@@ -83,3 +99,19 @@ def smarten_punctuation(container, report):
m.getparent().remove(m)
container.dirty(name)
+def rename_files(container, file_map):
+    ''' Rename the files in container according to file_map, a mapping of
+    current name to new name, and then update the links to the renamed files
+    in all files, including the OPF.
+
+    Raises ValueError, before anything is renamed, if a name is both a
+    source and a destination, if a destination already exists in the
+    container, or if two sources share the same destination. '''
+    sources = set(file_map)
+    destinations = set(file_map.itervalues())
+    overlap = sources.intersection(destinations)
+    if overlap:
+        raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
+    for name, dest in file_map.iteritems():
+        if container.exists(dest):
+            raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
+    # Fewer distinct destinations than entries means some are duplicated
+    if len(file_map) != len(destinations):
+        raise ValueError('Cannot rename, the set of destination files contains duplicates')
+    link_map = {}
+    for current_name, new_name in file_map.iteritems():
+        container.rename(current_name, new_name)
+        if new_name != container.opf_name:  # OPF is handled by the container
+            link_map[current_name] = new_name
+    replace_links(container, link_map, replace_in_opf=True)
+
diff --git a/src/calibre/ebooks/oeb/polish/tests/base.py b/src/calibre/ebooks/oeb/polish/tests/base.py
index 4818127ac1..4a09c3749c 100644
--- a/src/calibre/ebooks/oeb/polish/tests/base.py
+++ b/src/calibre/ebooks/oeb/polish/tests/base.py
@@ -45,6 +45,7 @@ def get_simple_book(fmt='epub'):
raw = raw.replace('LMONOI', P('fonts/liberation/LiberationMono-Italic.ttf'))
raw = raw.replace('LMONO', P('fonts/liberation/LiberationMono-Regular.ttf'))
raw = raw.replace('IMAGE1', I('marked.png'))
+ raw = raw.replace('IMAGE2', I('textures/light_wood.png'))
try:
with open(x, 'wb') as f:
f.write(raw.encode('utf-8'))
@@ -69,3 +70,10 @@ class BaseTest(unittest.TestCase):
shutil.rmtree(self.tdir, ignore_errors=True)
del self.tdir
+    def check_links(self, container):
+        ''' Assert that every link, in every file in container, that
+        container.href_to_name() maps to a name points to a file that
+        exists in the container. Links that do not map to a name are
+        ignored. '''
+        for name in container.name_path_map:
+            for link in container.iterlinks(name, get_line_numbers=False):
+                dest = container.href_to_name(link, name)
+                if not dest:
+                    continue
+                self.assertTrue(container.exists(dest), 'The link %s in %s does not exist' % (link, name))
+
diff --git a/src/calibre/ebooks/oeb/polish/tests/container.py b/src/calibre/ebooks/oeb/polish/tests/container.py
index 609766b6fd..4d854c32f7 100644
--- a/src/calibre/ebooks/oeb/polish/tests/container.py
+++ b/src/calibre/ebooks/oeb/polish/tests/container.py
@@ -6,12 +6,14 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '
-import os
+import os, subprocess
from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book
-from calibre.ebooks.oeb.polish.container import get_container, clone_container
+from calibre.ebooks.oeb.polish.container import get_container, clone_container, OCF_NS
+from calibre.ebooks.oeb.polish.replace import rename_files
from calibre.utils.filenames import nlinks_file
+from calibre.ptempfile import TemporaryFile
class ContainerTests(BaseTest):
@@ -73,3 +75,82 @@ class ContainerTests(BaseTest):
for x in files:
self.assertNotIn(x, raw)
+    def run_external_tools(self, container, gvim=False, epubcheck=True):
+        ''' Commit container to a temporary .epub file inside self.tdir and
+        run the requested external programs on it, gvim first and then
+        epubcheck, waiting for each one to exit. '''
+        with TemporaryFile(suffix='.epub', dir=self.tdir) as f:
+            container.commit(outpath=f)
+            commands = []
+            if gvim:
+                commands.append(['gvim', '-f', f])
+            if epubcheck:
+                commands.append(['epubcheck', f])
+            for cmd in commands:
+                subprocess.Popen(cmd).wait()
+
+    def test_file_rename(self):
+        ' Test renaming of files '
+        book = get_simple_book()
+        count = [0]
+        def new_container():
+            # Every container gets its own numbered directory under
+            # self.tdir. os.mkdir() returns None, so the path has to be
+            # built first and then passed on, otherwise tdir is always None.
+            count[0] += 1
+            tdir = os.path.join(self.tdir, str(count[0]))
+            os.mkdir(tdir)
+            return get_container(book, tdir=tdir)
+
+        # Test simple opf rename
+        c = new_container()
+        orig_name = c.opf_name
+        name = 'renamed opf.opf'
+        rename_files(c, {c.opf_name: name})
+        self.assertEqual(c.opf_name, name)
+        for x in ('name_path_map', 'mime_map'):
+            self.assertNotIn(orig_name, getattr(c, x))
+            self.assertIn(name, getattr(c, x))
+        self.assertNotIn(name, c.dirtied)
+        root = c.parsed('META-INF/container.xml')
+        vals = set(root.xpath(
+            r'child::ocf:rootfiles/ocf:rootfile/@full-path',
+            namespaces={'ocf':OCF_NS}))
+        self.assertSetEqual(vals, {name})
+        self.check_links(c)
+
+        # Test a rename that moves the OPF into different directory
+        c = new_container()
+        name = 'renamed/again/metadata.opf'
+        rename_files(c, {c.opf_name: name})
+        self.check_links(c)
+
+        # Test that renaming commits dirtied items
+        c = new_container()
+        name = next(c.spine_names)[0]
+        root = c.parsed(name)
+        root.xpath('//*[local-name()="body"]')[0].set('id', 'rename-dirty-test')
+        rename_files(c, {name:'other/' + name})
+        with c.open('other/' + name) as f:
+            raw = f.read()
+        self.assertIn(b'id="rename-dirty-test"', raw)
+        self.check_links(c)
+
+        # Test renaming of stylesheets
+        c = new_container()
+        rename_files(c, {'stylesheet.css':'styles/s 1.css', 'page_styles.css':'styles/p 1.css'})
+        self.check_links(c)
+
+        # Test renaming of images
+        c = new_container()
+        rename_files(c, {'cover.png':'images/cover img.png', 'light_wood.png':'images/light wood.png', 'marked.png':'images/marked img.png'})
+        self.check_links(c)
+
+        # Test renaming of ToC
+        c = new_container()
+        rename_files(c, {'toc.ncx': 'toc/toc file.ncx'})
+        self.check_links(c)
+
+        # Test renaming of font files
+        c = new_container()
+        rename_files(c, {'LiberationMono-Regular.ttf': 'fonts/LiberationMono Regular.ttf'})
+        self.check_links(c)
+
+        # Test renaming of text files
+        c = new_container()
+        rename_files(c, {'index_split_000.html':'text/page one.html', 'index_split_001.html':'text/page two.html'})
+        self.check_links(c)
+
+        # self.run_external_tools(c, gvim=True)
+
diff --git a/src/calibre/ebooks/oeb/polish/tests/simple.html b/src/calibre/ebooks/oeb/polish/tests/simple.html
index ce23f8591c..0d1638071f 100644
--- a/src/calibre/ebooks/oeb/polish/tests/simple.html
+++ b/src/calibre/ebooks/oeb/polish/tests/simple.html
@@ -15,16 +15,17 @@
font-style: italic;
}
-h1 {
+h2 {
color: DarkCyan;
text-align: center;
+ background-image: url("IMAGE2");
}
p { font-family: "Liberation Mono"; }
- A simple test page
+ A simple test page
To pursue pleasure rationally encounter consequences that are extremely
painful.
@@ -39,11 +40,11 @@ avoids a pain that produces no resultant pleasure?

-On the other hand.
+A link to the next page.
-Another test page
+Another test page
The great explorer of the truth, the master-builder of human happiness. No