diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 009902138f..e0e7a665cd 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -100,6 +100,12 @@ def close_self_closing_tags(raw): def uuid_id(): return 'u'+unicode(uuid.uuid4()) +def itercsslinks(raw): + for match in _css_url_re.finditer(raw): + yield match.group(1), match.start(1) + for match in _css_import_re.finditer(raw): + yield match.group(1), match.start(1) + def iterlinks(root, find_links_in_css=True): ''' Iterate over all links in a OEB Document. diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index 02c4df6937..785056aa2b 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, logging, sys, hashlib, uuid, re, shutil, copy +import os, logging, sys, hashlib, uuid, re, shutil from collections import defaultdict from io import BytesIO from urllib import unquote as urlunquote, quote as urlquote @@ -15,6 +15,7 @@ from urlparse import urlparse from future_builtins import zip from lxml import etree +from cssutils import replaceUrls, getUrls from calibre import guess_type as _guess_type, CurrentDir from calibre.customize.ui import (plugin_for_input_format, @@ -27,7 +28,8 @@ from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.tweak import set_cover from calibre.ebooks.oeb.base import ( - serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF) + serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF, + rewrite_links, iterlinks, itercsslinks) from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile @@ -105,6 +107,7 @@ class Container(object): # {{{ self.encoding_map = {} self.pretty_print = set() self.cloned = False + self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print') if clone_data is not None: self.cloned = True @@ -153,6 +156,100 @@ class Container(object): # {{{ for name, path in self.name_path_map.iteritems()} } + def rename(self, current_name, new_name): + ''' Renames a file from current_name to new_name. It automatically + rebases all links inside the file if the directory the file is in + changes. Note however, that links are not updated in the other files + that could reference this file. This is for performance, such updates + should be done once, in bulk. ''' + if current_name in self.names_that_must_not_be_changed: + raise ValueError('Renaming of %s is not allowed' % current_name) + if self.exists(new_name): + raise ValueError('Cannot rename %s to %s as %s already exists' % (self.opf_name, new_name, new_name)) + new_path = self.name_to_abspath(new_name) + base = os.path.dirname(new_path) + if os.path.isfile(base): + raise ValueError('Cannot rename %s to %s as %s is a file' % (self.opf_name, new_name, base)) + if not os.path.exists(base): + os.makedirs(base) + old_path = parent_dir = self.name_to_abspath(current_name) + self.commit_item(current_name) + os.rename(old_path, new_path) + # Remove empty directories + while parent_dir: + parent_dir = os.path.dirname(parent_dir) + try: + os.rmdir(parent_dir) + except EnvironmentError: + break + + for x in ('mime_map', 'encoding_map'): + x = getattr(self, x) + if current_name in x: + x[new_name] = x[current_name] + self.name_path_map[new_name] = new_path + for x in self.cache_names: + x = getattr(self, x) + try: + x.pop(current_name, None) + except TypeError: + x.discard(current_name) + if current_name == self.opf_name: + self.opf_name = new_name + if os.path.dirname(old_path) != os.path.dirname(new_path): + from calibre.ebooks.oeb.polish.replace import LinkRebaser + repl = LinkRebaser(self, current_name, new_name) + self.replace_links(new_name, repl) + self.dirty(new_name) + + def replace_links(self, name, replace_func): + ''' Replace all links in name using replace_func, which must be a + callable that accepts a URL and returns the replaced URL. It must also + have a 'replaced' attribute that is set to True if any actual + replacement is done. Convenient ways of creating such callables are + using the :class:`LinkReplacer` and :class:`LinkRebaser` classes. ''' + media_type = self.mime_map.get(name, guess_type(name)) + if name == self.opf_name: + for elem in self.opf_xpath('//*[@href]'): + elem.set('href', replace_func(elem.get('href'))) + elif media_type.lower() in OEB_DOCS: + rewrite_links(self.parsed(name), replace_func) + elif media_type.lower() in OEB_STYLES: + replaceUrls(self.parsed(name), replace_func) + elif media_type.lower() == guess_type('toc.ncx'): + for elem in self.parsed(name).xpath('//*[@src]'): + elem.set('src', replace_func(elem.get('src'))) + + if replace_func.replaced: + self.dirty(name) + return replace_func.replaced + + def iterlinks(self, name, get_line_numbers=True): + ''' Iterate over all links in name. If get_line_numbers is True the + yields results of the form (link, line_number, offset). Where + line_number is the line_number at which the link occurs and offset is + the number of characters from the start of the line. Note that offset + could actually encompass several lines if not zero. ''' + media_type = self.mime_map.get(name, guess_type(name)) + if name == self.opf_name: + for elem in self.opf_xpath('//*[@href]'): + yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href') + elif media_type.lower() in OEB_DOCS: + for el, attr, link, pos in iterlinks(self.parsed(name)): + yield (link, el.sourceline, pos) if get_line_numbers else link + elif media_type.lower() in OEB_STYLES: + if get_line_numbers: + with self.open(name) as f: + raw = self.decode(f.read()) + for link, offset in itercsslinks(raw): + yield link, 0, offset + else: + for link in getUrls(self.parsed(name)): + yield link + elif media_type.lower() == guess_type('toc.ncx'): + for elem in self.parsed(name).xpath('//*[@src]'): + yield (elem.get('src'), elem.sourceline, 0) if get_line_numbers else elem.get('src') + def abspath_to_name(self, fullpath): return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/') @@ -648,15 +745,15 @@ class EpubContainer(Container): container_path = join(self.root, 'META-INF', 'container.xml') if not exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') - self.container = etree.fromstring(open(container_path, 'rb').read()) - opf_files = self.container.xpath(( + container = etree.fromstring(open(container_path, 'rb').read()) + opf_files = container.xpath(( r'child::ocf:rootfiles/ocf:rootfile' '[@media-type="%s" and @full-path]'%guess_type('a.opf') ), namespaces={'ocf':OCF_NS} ) if not opf_files: raise InvalidEpub('META-INF/container.xml contains no link to OPF file') - opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/')) + opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/'))) if not exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to' ' by META-INF/container.xml') @@ -666,14 +763,35 @@ class EpubContainer(Container): self.obfuscated_fonts = {} if 'META-INF/encryption.xml' in self.name_path_map: self.process_encryption() + self.parsed_cache['META-INF/container.xml'] = container def clone_data(self, dest_dir): ans = super(EpubContainer, self).clone_data(dest_dir) ans['pathtoepub'] = self.pathtoepub ans['obfuscated_fonts'] = self.obfuscated_fonts.copy() - ans['container'] = copy.deepcopy(self.container) return ans + def rename(self, old_name, new_name): + is_opf = old_name == self.opf_name + super(EpubContainer, self).rename(old_name, new_name) + if is_opf: + for elem in self.parsed('META-INF/container.xml').xpath(( + r'child::ocf:rootfiles/ocf:rootfile' + '[@media-type="%s" and @full-path]'%guess_type('a.opf') + ), namespaces={'ocf':OCF_NS} + ): + # The asinine epubcheck cannot handle quoted filenames in + # container.xml + elem.set('full-path', self.opf_name) + self.dirty('META-INF/container.xml') + if old_name in self.obfuscated_fonts: + self.obfuscated_fonts[new_name] = self.obfuscated_fonts.pop(old_name) + enc = self.parsed('META-INF/encryption.xml') + for cr in enc.xpath('//*[local-name()="CipherReference" and @URI]'): + if self.href_to_name(cr.get('URI')) == old_name: + cr.set('URI', self.name_to_href(new_name)) + self.dirty('META-INF/encryption.xml') + @property def names_that_need_not_be_manifested(self): return super(EpubContainer, self).names_that_need_not_be_manifested | {'META-INF/' + x for x in self.META_INF} diff --git a/src/calibre/ebooks/oeb/polish/replace.py b/src/calibre/ebooks/oeb/polish/replace.py index b26589d5bf..a78f7b5ef7 100644 --- a/src/calibre/ebooks/oeb/polish/replace.py +++ b/src/calibre/ebooks/oeb/polish/replace.py @@ -10,11 +10,7 @@ __docformat__ = 'restructuredtext en' import codecs from urlparse import urlparse -from cssutils import replaceUrls - from calibre.ebooks.chardet import strip_encoding_declarations -from calibre.ebooks.oeb.polish.container import guess_type -from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links) class LinkReplacer(object): @@ -26,6 +22,12 @@ class LinkReplacer(object): self.replaced = False def __call__(self, url): + if url and url.startswith('#'): + repl = self.frag_map(self.base, url[1:]) + if not repl or repl == url[1:]: + return url + self.replaced = True + return '#' + repl name = self.container.href_to_name(url, self.base) if not name: return url @@ -42,23 +44,37 @@ class LinkReplacer(object): self.replaced = True return href -def replace_links(container, link_map, frag_map=lambda name, frag:frag): - ncx_type = guess_type('toc.ncx') - for name, media_type in container.mime_map.iteritems(): - repl = LinkReplacer(name, container, link_map, frag_map) - if media_type.lower() in OEB_DOCS: - rewrite_links(container.parsed(name), repl) - elif media_type.lower() in OEB_STYLES: - replaceUrls(container.parsed(name), repl) - elif media_type.lower() == ncx_type: - for elem in container.parsed(name).xpath('//*[@src]'): - src = elem.get('src') - nsrc = repl(src) - if src != nsrc: - elem.set('src', nsrc) +class LinkRebaser(object): - if repl.replaced: - container.dirty(name) + def __init__(self, container, old_name, new_name): + self.old_name, self.new_name = old_name, new_name + self.container = container + self.replaced = False + + def __call__(self, url): + if url and url.startswith('#'): + return url + purl = urlparse(url) + frag = purl.fragment + name = self.container.href_to_name(url, self.old_name) + if not name: + return url + if name == self.old_name: + name = self.new_name + href = self.container.name_to_href(name, self.new_name) + if frag: + href += '#' + frag + if href != url: + self.replaced = True + return href + + +def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False): + for name, media_type in container.mime_map.iteritems(): + if name == container.opf_name and not replace_in_opf: + continue + repl = LinkReplacer(name, container, link_map, frag_map) + container.replace_links(name, repl) def smarten_punctuation(container, report): from calibre.ebooks.conversion.preprocess import smarten_punctuation @@ -83,3 +99,19 @@ def smarten_punctuation(container, report): m.getparent().remove(m) container.dirty(name) +def rename_files(container, file_map): + overlap = set(file_map).intersection(set(file_map.itervalues())) + if overlap: + raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap)) + for name, dest in file_map.iteritems(): + if container.exists(dest): + raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest)) + if len(tuple(file_map.itervalues())) != len(set(file_map.itervalues())): + raise ValueError('Cannot rename, the set of destination files contains duplicates') + link_map = {} + for current_name, new_name in file_map.iteritems(): + container.rename(current_name, new_name) + if new_name != container.opf_name: # OPF is handled by the container + link_map[current_name] = new_name + replace_links(container, link_map, replace_in_opf=True) + diff --git a/src/calibre/ebooks/oeb/polish/tests/base.py b/src/calibre/ebooks/oeb/polish/tests/base.py index 4818127ac1..4a09c3749c 100644 --- a/src/calibre/ebooks/oeb/polish/tests/base.py +++ b/src/calibre/ebooks/oeb/polish/tests/base.py @@ -45,6 +45,7 @@ def get_simple_book(fmt='epub'): raw = raw.replace('LMONOI', P('fonts/liberation/LiberationMono-Italic.ttf')) raw = raw.replace('LMONO', P('fonts/liberation/LiberationMono-Regular.ttf')) raw = raw.replace('IMAGE1', I('marked.png')) + raw = raw.replace('IMAGE2', I('textures/light_wood.png')) try: with open(x, 'wb') as f: f.write(raw.encode('utf-8')) @@ -69,3 +70,10 @@ class BaseTest(unittest.TestCase): shutil.rmtree(self.tdir, ignore_errors=True) del self.tdir + def check_links(self, container): + for name in container.name_path_map: + for link in container.iterlinks(name, get_line_numbers=False): + dest = container.href_to_name(link, name) + if dest: + self.assertTrue(container.exists(dest), 'The link %s in %s does not exist' % (link, name)) + diff --git a/src/calibre/ebooks/oeb/polish/tests/container.py b/src/calibre/ebooks/oeb/polish/tests/container.py index 609766b6fd..4d854c32f7 100644 --- a/src/calibre/ebooks/oeb/polish/tests/container.py +++ b/src/calibre/ebooks/oeb/polish/tests/container.py @@ -6,12 +6,14 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -import os +import os, subprocess from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book -from calibre.ebooks.oeb.polish.container import get_container, clone_container +from calibre.ebooks.oeb.polish.container import get_container, clone_container, OCF_NS +from calibre.ebooks.oeb.polish.replace import rename_files from calibre.utils.filenames import nlinks_file +from calibre.ptempfile import TemporaryFile class ContainerTests(BaseTest): @@ -73,3 +75,82 @@ class ContainerTests(BaseTest): for x in files: self.assertNotIn(x, raw) + def run_external_tools(self, container, gvim=False, epubcheck=True): + with TemporaryFile(suffix='.epub', dir=self.tdir) as f: + container.commit(outpath=f) + if gvim: + subprocess.Popen(['gvim', '-f', f]).wait() + if epubcheck: + subprocess.Popen(['epubcheck', f]).wait() + + def test_file_rename(self): + ' Test renaming of files ' + book = get_simple_book() + count = [0] + def new_container(): + count[0] += 1 + tdir = os.mkdir(os.path.join(self.tdir, str(count[0]))) + return get_container(book, tdir=tdir) + + # Test simple opf rename + c = new_container() + orig_name = c.opf_name + name = 'renamed opf.opf' + rename_files(c, {c.opf_name: name}) + self.assertEqual(c.opf_name, name) + for x in ('name_path_map', 'mime_map'): + self.assertNotIn(orig_name, getattr(c, x)) + self.assertIn(name, getattr(c, x)) + self.assertNotIn(name, c.dirtied) + root = c.parsed('META-INF/container.xml') + vals = set(root.xpath( + r'child::ocf:rootfiles/ocf:rootfile/@full-path', + namespaces={'ocf':OCF_NS})) + self.assertSetEqual(vals, {name}) + self.check_links(c) + + # Test a rename that moves the OPF into different directory + c = new_container() + orig_name = c.opf_name + name = 'renamed/again/metadata.opf' + rename_files(c, {c.opf_name: name}) + self.check_links(c) + + # Test that renaming commits dirtied items + c = new_container() + name = next(c.spine_names)[0] + root = c.parsed(name) + root.xpath('//*[local-name()="body"]')[0].set('id', 'rename-dirty-test') + rename_files(c, {name:'other/' + name}) + with c.open('other/' + name) as f: + raw = f.read() + self.assertIn(b'id="rename-dirty-test"', raw) + self.check_links(c) + + # Test renaming of stylesheets + c = new_container() + rename_files(c, {'stylesheet.css':'styles/s 1.css', 'page_styles.css':'styles/p 1.css'}) + self.check_links(c) + + # Test renaming of images + c = new_container() + rename_files(c, {'cover.png':'images/cover img.png', 'light_wood.png':'images/light wood.png', 'marked.png':'images/marked img.png'}) + self.check_links(c) + + # Test renaming of ToC + c = new_container() + rename_files(c, {'toc.ncx': 'toc/toc file.ncx'}) + self.check_links(c) + + # Test renaming of font files + c = new_container() + rename_files(c, {'LiberationMono-Regular.ttf': 'fonts/LiberationMono Regular.ttf'}) + self.check_links(c) + + # Test renaming of text files + c = new_container() + rename_files(c, {'index_split_000.html':'text/page one.html', 'index_split_001.html':'text/page two.html'}) + self.check_links(c) + + # self.run_external_tools(c, gvim=True) + diff --git a/src/calibre/ebooks/oeb/polish/tests/simple.html b/src/calibre/ebooks/oeb/polish/tests/simple.html index ce23f8591c..0d1638071f 100644 --- a/src/calibre/ebooks/oeb/polish/tests/simple.html +++ b/src/calibre/ebooks/oeb/polish/tests/simple.html @@ -15,16 +15,17 @@ font-style: italic; } -h1 { +h2 { color: DarkCyan; text-align: center; + background-image: url("IMAGE2"); } p { font-family: "Liberation Mono"; } -

A simple test page

+

A simple test page

To pursue pleasure rationally encounter consequences that are extremely painful.

@@ -39,11 +40,11 @@ avoids a pain that produces no resultant pleasure?

test
-

On the other hand.

+

A link to the next page.

-

Another test page

+

Another test page

The great explorer of the truth, the master-builder of human happiness. No