Implement renaming in book containers

2025-07-09 03:04:10 -04:00 · 2013-10-21 14:29:53 +05:30 · 2013-10-21 14:29:53 +05:30 · f83313f593
commit f83313f593
parent f48d9a47ca
6 changed files with 278 additions and 32 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -100,6 +100,12 @@ def close_self_closing_tags(raw):
 def uuid_id():
    return 'u'+unicode(uuid.uuid4())

+def itercsslinks(raw):
+    ''' Yield (link, offset) pairs for every match of _css_url_re in raw,
+    followed by every match of _css_import_re. link is the first group of the
+    match and offset is the index in raw at which that group starts. '''
+    for pattern in (_css_url_re, _css_import_re):
+        for m in pattern.finditer(raw):
+            yield m.group(1), m.start(1)
+
 def iterlinks(root, find_links_in_css=True):
    '''
    Iterate over all links in a OEB Document.
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, logging, sys, hashlib, uuid, re, shutil, copy
+import os, logging, sys, hashlib, uuid, re, shutil
 from collections import defaultdict
 from io import BytesIO
 from urllib import unquote as urlunquote, quote as urlquote
@ -15,6 +15,7 @@ from urlparse import urlparse
 from future_builtins import zip

 from lxml import etree
+from cssutils import replaceUrls, getUrls

 from calibre import guess_type as _guess_type, CurrentDir
 from calibre.customize.ui import (plugin_for_input_format,
@ -27,7 +28,8 @@ from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.reader.headers import MetadataHeader
 from calibre.ebooks.mobi.tweak import set_cover
 from calibre.ebooks.oeb.base import (
-    serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
+    serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF,
+    rewrite_links, iterlinks, itercsslinks)
 from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@ -105,6 +107,7 @@ class Container(object):  # {{{
        self.encoding_map = {}
        self.pretty_print = set()
        self.cloned = False
+        self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

        if clone_data is not None:
            self.cloned = True
@ -153,6 +156,100 @@ class Container(object):  # {{{
                for name, path in self.name_path_map.iteritems()}
        }

+    def rename(self, current_name, new_name):
+        ''' Renames a file from current_name to new_name. It automatically
+        rebases all links inside the file if the directory the file is in
+        changes. Note however, that links are not updated in the other files
+        that could reference this file. This is for performance, such updates
+        should be done once, in bulk.
+
+        Raises ValueError if current_name is in names_that_must_not_be_changed,
+        if new_name already exists in the container, or if the would-be parent
+        directory of new_name is an existing file. '''
+        if current_name in self.names_that_must_not_be_changed:
+            raise ValueError('Renaming of %s is not allowed' % current_name)
+        if self.exists(new_name):
+            raise ValueError('Cannot rename %s to %s as %s already exists' % (current_name, new_name, new_name))
+        new_path = self.name_to_abspath(new_name)
+        base = os.path.dirname(new_path)
+        if os.path.isfile(base):
+            raise ValueError('Cannot rename %s to %s as %s is a file' % (current_name, new_name, base))
+        if not os.path.exists(base):
+            os.makedirs(base)
+        old_path = parent_dir = self.name_to_abspath(current_name)
+        # Commit the item before moving the file on disk (presumably this
+        # writes out any pending in-memory changes; the cached entries for
+        # the old name are dropped further down)
+        self.commit_item(current_name)
+        os.rename(old_path, new_path)
+        # Remove empty directories, walking upwards until os.rmdir() fails
+        # (for instance because the directory still has other entries)
+        while parent_dir:
+            parent_dir = os.path.dirname(parent_dir)
+            try:
+                os.rmdir(parent_dir)
+            except EnvironmentError:
+                break
+
+        # Carry the per-file metadata over to the new name
+        for attr in ('mime_map', 'encoding_map'):
+            mapping = getattr(self, attr)
+            if current_name in mapping:
+                mapping[new_name] = mapping[current_name]
+        self.name_path_map[new_name] = new_path
+        # Forget everything recorded under the old name. cache_names refers to
+        # both dicts and sets; set.pop() accepts no arguments and raises
+        # TypeError, in which case discard() is used instead.
+        for attr in self.cache_names:
+            cache = getattr(self, attr)
+            try:
+                cache.pop(current_name, None)
+            except TypeError:
+                cache.discard(current_name)
+        if current_name == self.opf_name:
+            self.opf_name = new_name
+        if os.path.dirname(old_path) != os.path.dirname(new_path):
+            # The file changed directory, so its relative links must be rebased.
+            # Imported locally, presumably to avoid a circular import.
+            from calibre.ebooks.oeb.polish.replace import LinkRebaser
+            repl = LinkRebaser(self, current_name, new_name)
+            self.replace_links(new_name, repl)
+            self.dirty(new_name)
+
+    def replace_links(self, name, replace_func):
+        ''' Rewrite every link found in the file name by passing it through
+        replace_func, a callable that takes a URL and returns its replacement.
+        replace_func must also expose a 'replaced' attribute that is True once
+        any actual replacement has happened. The :class:`LinkReplacer` and
+        :class:`LinkRebaser` classes are convenient ways of creating such
+        callables. If something was replaced, name is marked dirty. Returns
+        the value of replace_func.replaced. '''
+        mt = self.mime_map.get(name, guess_type(name))
+
+        def rewrite_attr(elements, attr):
+            # Pass the value of attr on every element through replace_func
+            for node in elements:
+                node.set(attr, replace_func(node.get(attr)))
+
+        if name == self.opf_name:
+            rewrite_attr(self.opf_xpath('//*[@href]'), 'href')
+        elif mt.lower() in OEB_DOCS:
+            rewrite_links(self.parsed(name), replace_func)
+        elif mt.lower() in OEB_STYLES:
+            replaceUrls(self.parsed(name), replace_func)
+        elif mt.lower() == guess_type('toc.ncx'):
+            rewrite_attr(self.parsed(name).xpath('//*[@src]'), 'src')
+
+        if replace_func.replaced:
+            self.dirty(name)
+        return replace_func.replaced
+
+    def iterlinks(self, name, get_line_numbers=True):
+        ''' Iterate over all links in name. If get_line_numbers is True then
+        this yields results of the form (link, line_number, offset), where
+        line_number is the line at which the link occurs and offset is the
+        number of characters from the start of the line. Note that offset
+        could actually encompass several lines if not zero. For stylesheets
+        line_number is always 0 and offset is the position of the link in the
+        decoded text of the file. If get_line_numbers is False only the links
+        themselves are yielded. '''
+        mt = self.mime_map.get(name, guess_type(name))
+        if name == self.opf_name:
+            for node in self.opf_xpath('//*[@href]'):
+                href = node.get('href')
+                if get_line_numbers:
+                    yield href, node.sourceline, 0
+                else:
+                    yield href
+        elif mt.lower() in OEB_DOCS:
+            for el, attr, link, pos in iterlinks(self.parsed(name)):
+                if get_line_numbers:
+                    yield link, el.sourceline, pos
+                else:
+                    yield link
+        elif mt.lower() in OEB_STYLES:
+            if not get_line_numbers:
+                for link in getUrls(self.parsed(name)):
+                    yield link
+            else:
+                # Offsets are needed, so scan the raw text of the file
+                # instead of the parsed stylesheet
+                with self.open(name) as f:
+                    raw = self.decode(f.read())
+                    for link, offset in itercsslinks(raw):
+                        yield link, 0, offset
+        elif mt.lower() == guess_type('toc.ncx'):
+            for node in self.parsed(name).xpath('//*[@src]'):
+                src = node.get('src')
+                if get_line_numbers:
+                    yield src, node.sourceline, 0
+                else:
+                    yield src
+
    def abspath_to_name(self, fullpath):
        return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/')

@ -648,15 +745,15 @@ class EpubContainer(Container):
        container_path = join(self.root, 'META-INF', 'container.xml')
        if not exists(container_path):
            raise InvalidEpub('No META-INF/container.xml in epub')
-        self.container = etree.fromstring(open(container_path, 'rb').read())
-        opf_files = self.container.xpath((
+        container = etree.fromstring(open(container_path, 'rb').read())
+        opf_files = container.xpath((
            r'child::ocf:rootfiles/ocf:rootfile'
            '[@media-type="%s" and @full-path]'%guess_type('a.opf')
            ), namespaces={'ocf':OCF_NS}
        )
        if not opf_files:
            raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
-        opf_path = os.path.join(self.root, *opf_files[0].get('full-path').split('/'))
+        opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
        if not exists(opf_path):
            raise InvalidEpub('OPF file does not exist at location pointed to'
                    ' by META-INF/container.xml')
@ -666,14 +763,35 @@ class EpubContainer(Container):
        self.obfuscated_fonts = {}
        if 'META-INF/encryption.xml' in self.name_path_map:
            self.process_encryption()
+        self.parsed_cache['META-INF/container.xml'] = container

    def clone_data(self, dest_dir):
        ans = super(EpubContainer, self).clone_data(dest_dir)
        ans['pathtoepub'] = self.pathtoepub
        ans['obfuscated_fonts'] = self.obfuscated_fonts.copy()
-        ans['container'] = copy.deepcopy(self.container)
        return ans

+    def rename(self, old_name, new_name):
+        ''' Rename old_name to new_name and additionally keep the EPUB specific
+        files in sync: the rootfile entries in META-INF/container.xml when the
+        OPF itself is renamed, and the obfuscated font records together with
+        the matching CipherReference URIs in META-INF/encryption.xml when an
+        obfuscated font is renamed. '''
+        # Must be evaluated before the rename, which updates self.opf_name
+        renaming_opf = old_name == self.opf_name
+        super(EpubContainer, self).rename(old_name, new_name)
+        if renaming_opf:
+            rootfile_query = (
+                r'child::ocf:rootfiles/ocf:rootfile'
+                '[@media-type="%s" and @full-path]'%guess_type('a.opf'))
+            container_xml = self.parsed('META-INF/container.xml')
+            for rootfile in container_xml.xpath(rootfile_query, namespaces={'ocf':OCF_NS}):
+                # The asinine epubcheck cannot handle quoted filenames in
+                # container.xml
+                rootfile.set('full-path', self.opf_name)
+            self.dirty('META-INF/container.xml')
+        if old_name not in self.obfuscated_fonts:
+            return
+        self.obfuscated_fonts[new_name] = self.obfuscated_fonts.pop(old_name)
+        encryption = self.parsed('META-INF/encryption.xml')
+        for ref in encryption.xpath('//*[local-name()="CipherReference" and @URI]'):
+            if self.href_to_name(ref.get('URI')) == old_name:
+                ref.set('URI', self.name_to_href(new_name))
+                self.dirty('META-INF/encryption.xml')
+
    @property
    def names_that_need_not_be_manifested(self):
        return super(EpubContainer, self).names_that_need_not_be_manifested | {'META-INF/' + x for x in self.META_INF}
--- a/src/calibre/ebooks/oeb/polish/replace.py
+++ b/src/calibre/ebooks/oeb/polish/replace.py
@ -10,11 +10,7 @@ __docformat__ = 'restructuredtext en'
 import codecs
 from urlparse import urlparse

-from cssutils import replaceUrls
-
 from calibre.ebooks.chardet import strip_encoding_declarations
-from calibre.ebooks.oeb.polish.container import guess_type
-from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)

 class LinkReplacer(object):

@ -26,6 +22,12 @@ class LinkReplacer(object):
        self.replaced = False

    def __call__(self, url):
+        if url and url.startswith('#'):
+            repl = self.frag_map(self.base, url[1:])
+            if not repl or repl == url[1:]:
+                return url
+            self.replaced = True
+            return '#' + repl
        name = self.container.href_to_name(url, self.base)
        if not name:
            return url
@ -42,23 +44,37 @@ class LinkReplacer(object):
            self.replaced = True
        return href

-def replace_links(container, link_map, frag_map=lambda name, frag:frag):
-    ncx_type = guess_type('toc.ncx')
-    for name, media_type in container.mime_map.iteritems():
-        repl = LinkReplacer(name, container, link_map, frag_map)
-        if media_type.lower() in OEB_DOCS:
-            rewrite_links(container.parsed(name), repl)
-        elif media_type.lower() in OEB_STYLES:
-            replaceUrls(container.parsed(name), repl)
-        elif media_type.lower() == ncx_type:
-            for elem in container.parsed(name).xpath('//*[@src]'):
-                src = elem.get('src')
-                nsrc = repl(src)
-                if src != nsrc:
-                    elem.set('src', nsrc)
+class LinkRebaser(object):

-        if repl.replaced:
-            container.dirty(name)
+    def __init__(self, container, old_name, new_name):
+        ''' Remember the container and the old and new names of the file
+        whose links are to be rebased. replaced starts out False and is set
+        to True by __call__ once a URL is actually changed. '''
+        self.container = container
+        self.old_name = old_name
+        self.new_name = new_name
+        self.replaced = False
+
+    def __call__(self, url):
+        ''' Return url rewritten so that it is relative to new_name instead of
+        old_name. URLs that are only a fragment, and URLs that
+        container.href_to_name() cannot map to a name, are returned unchanged.
+        A link from the file to itself is pointed at new_name. '''
+        if url and url.startswith('#'):
+            return url
+        fragment = urlparse(url).fragment
+        target = self.container.href_to_name(url, self.old_name)
+        if not target:
+            return url
+        if target == self.old_name:
+            target = self.new_name
+        rebased = self.container.name_to_href(target, self.new_name)
+        if fragment:
+            rebased = rebased + '#' + fragment
+        if rebased != url:
+            self.replaced = True
+        return rebased
+
+
+def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
+    ''' Apply link_map (and frag_map for fragments) to the links of every file
+    listed in container.mime_map, using a fresh LinkReplacer per file. The OPF
+    is skipped unless replace_in_opf is True. Dispatching on the type of each
+    file and marking changed files dirty is done by container.replace_links(). '''
+    for name in container.mime_map:
+        if name == container.opf_name and not replace_in_opf:
+            continue
+        repl = LinkReplacer(name, container, link_map, frag_map)
+        container.replace_links(name, repl)

 def smarten_punctuation(container, report):
    from calibre.ebooks.conversion.preprocess import smarten_punctuation
@ -83,3 +99,19 @@ def smarten_punctuation(container, report):
                m.getparent().remove(m)
            container.dirty(name)

+def rename_files(container, file_map):
+    ''' Rename files in container according to file_map, a mapping of current
+    name to new name, and then update the links to the renamed files in one
+    pass over the container (including the OPF). Raises ValueError if a name
+    is both a source and a destination, if a destination already exists or if
+    two files map to the same destination. '''
+    sources, targets = set(file_map), set(file_map.itervalues())
+    overlap = sources.intersection(targets)
+    if overlap:
+        raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
+    for name, dest in file_map.iteritems():
+        if container.exists(dest):
+            raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
+    if len(file_map) != len(targets):
+        raise ValueError('Cannot rename, the set of destination files contains duplicates')
+    link_map = {}
+    for current_name, new_name in file_map.iteritems():
+        container.rename(current_name, new_name)
+        if new_name == container.opf_name:  # OPF is handled by the container
+            continue
+        link_map[current_name] = new_name
+    replace_links(container, link_map, replace_in_opf=True)
+
--- a/src/calibre/ebooks/oeb/polish/tests/base.py
+++ b/src/calibre/ebooks/oeb/polish/tests/base.py
@ -45,6 +45,7 @@ def get_simple_book(fmt='epub'):
        raw = raw.replace('LMONOI', P('fonts/liberation/LiberationMono-Italic.ttf'))
        raw = raw.replace('LMONO', P('fonts/liberation/LiberationMono-Regular.ttf'))
        raw = raw.replace('IMAGE1', I('marked.png'))
+        raw = raw.replace('IMAGE2', I('textures/light_wood.png'))
        try:
            with open(x, 'wb') as f:
                f.write(raw.encode('utf-8'))
@ -69,3 +70,10 @@ class BaseTest(unittest.TestCase):
        shutil.rmtree(self.tdir, ignore_errors=True)
        del self.tdir

+    def check_links(self, container):
+        ' Assert that every link that resolves to a name in container points to an existing file '
+        for name in container.name_path_map:
+            for link in container.iterlinks(name, get_line_numbers=False):
+                dest = container.href_to_name(link, name)
+                if not dest:
+                    # href_to_name() gave no name for this link, nothing to check
+                    continue
+                self.assertTrue(container.exists(dest), 'The link %s in %s does not exist' % (link, name))
+
--- a/src/calibre/ebooks/oeb/polish/tests/container.py
+++ b/src/calibre/ebooks/oeb/polish/tests/container.py
@ -6,12 +6,14 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

-import os
+import os, subprocess

 from calibre.ebooks.oeb.polish.tests.base import BaseTest, get_simple_book

-from calibre.ebooks.oeb.polish.container import get_container, clone_container
+from calibre.ebooks.oeb.polish.container import get_container, clone_container, OCF_NS
+from calibre.ebooks.oeb.polish.replace import rename_files
 from calibre.utils.filenames import nlinks_file
+from calibre.ptempfile import TemporaryFile

 class ContainerTests(BaseTest):

@ -73,3 +75,82 @@ class ContainerTests(BaseTest):
        for x in files:
            self.assertNotIn(x, raw)

+    def run_external_tools(self, container, gvim=False, epubcheck=True):
+        ' Commit container to a temporary .epub and run gvim and/or epubcheck on it, waiting for each to exit '
+        with TemporaryFile(suffix='.epub', dir=self.tdir) as f:
+            container.commit(outpath=f)
+            commands = []
+            if gvim:
+                commands.append(['gvim', '-f', f])
+            if epubcheck:
+                commands.append(['epubcheck', f])
+            for cmd in commands:
+                subprocess.Popen(cmd).wait()
+
+    def test_file_rename(self):
+        ' Test renaming of files '
+        book = get_simple_book()
+        count = [0]
+        def new_container():
+            # Every scenario gets a fresh container in its own numbered
+            # sub-directory of self.tdir
+            count[0] += 1
+            tdir = os.path.join(self.tdir, str(count[0]))
+            os.mkdir(tdir)  # os.mkdir() returns None, so keep the path separately
+            return get_container(book, tdir=tdir)
+
+        # Test simple opf rename
+        c = new_container()
+        orig_name = c.opf_name
+        name = 'renamed opf.opf'
+        rename_files(c, {c.opf_name: name})
+        self.assertEqual(c.opf_name, name)
+        for x in ('name_path_map', 'mime_map'):
+            self.assertNotIn(orig_name, getattr(c, x))
+            self.assertIn(name, getattr(c, x))
+        self.assertNotIn(name, c.dirtied)
+        root = c.parsed('META-INF/container.xml')
+        vals = set(root.xpath(
+            r'child::ocf:rootfiles/ocf:rootfile/@full-path',
+            namespaces={'ocf':OCF_NS}))
+        self.assertSetEqual(vals, {name})
+        self.check_links(c)
+
+        # Test a rename that moves the OPF into different directory
+        c = new_container()
+        name = 'renamed/again/metadata.opf'
+        rename_files(c, {c.opf_name: name})
+        self.check_links(c)
+
+        # Test that renaming commits dirtied items
+        c = new_container()
+        name = next(c.spine_names)[0]
+        root = c.parsed(name)
+        root.xpath('//*[local-name()="body"]')[0].set('id', 'rename-dirty-test')
+        rename_files(c, {name:'other/' + name})
+        with c.open('other/' + name) as f:
+            raw = f.read()
+        self.assertIn(b'id="rename-dirty-test"', raw)
+        self.check_links(c)
+
+        # Test renaming of stylesheets
+        c = new_container()
+        rename_files(c, {'stylesheet.css':'styles/s 1.css', 'page_styles.css':'styles/p 1.css'})
+        self.check_links(c)
+
+        # Test renaming of images
+        c = new_container()
+        rename_files(c, {'cover.png':'images/cover img.png', 'light_wood.png':'images/light wood.png', 'marked.png':'images/marked img.png'})
+        self.check_links(c)
+
+        # Test renaming of ToC
+        c = new_container()
+        rename_files(c, {'toc.ncx': 'toc/toc file.ncx'})
+        self.check_links(c)
+
+        # Test renaming of font files
+        c = new_container()
+        rename_files(c, {'LiberationMono-Regular.ttf': 'fonts/LiberationMono Regular.ttf'})
+        self.check_links(c)
+
+        # Test renaming of text files
+        c = new_container()
+        rename_files(c, {'index_split_000.html':'text/page one.html', 'index_split_001.html':'text/page two.html'})
+        self.check_links(c)
+
+        # self.run_external_tools(c, gvim=True)
+
--- a/src/calibre/ebooks/oeb/polish/tests/simple.html
+++ b/src/calibre/ebooks/oeb/polish/tests/simple.html
@ -15,16 +15,17 @@
    font-style: italic;
 }

-h1 {
+h2 {
    color: DarkCyan;
    text-align: center;
+    background-image: url("IMAGE2");
 }

 p { font-family: "Liberation Mono"; }
        </style>
 	</head>
 	<body>
-    <h2>A simple test page</h2>
+    <h2 id="page1">A simple test page</h2>
 <!--lorem-->
 <p>To pursue pleasure rationally encounter consequences that are extremely
 painful.</p>
@ -39,11 +40,11 @@ avoids a pain that produces no resultant pleasure?</p>

 <div style="text-align:center"><img alt="test" src="IMAGE1"></div>

-<p>On the other hand.</p>
+<p>A <a href="#page2">link to the next page</a>.</p>

 <!--/lorem-->

-<h2>Another test page</h2>
+<h2 id="page2">Another test page</h2>

 <!--lorem-->
 <p>The great explorer of the truth, the master-builder of human happiness. No