ebook-polish: Update covers in epub

2025-06-23 15:30:45 -04:00 · 2013-02-12 10:27:47 +05:30 · 2013-02-12 10:27:47 +05:30 · c91c1aeba2
commit c91c1aeba2
parent 9a0164059a
4 changed files with 314 additions and 19 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'

 import os, logging, sys, hashlib, uuid
 from urllib import unquote as urlunquote, quote as urlquote
+from urlparse import urlparse

 from lxml import etree

@ -96,16 +97,22 @@ class Container(object):
    def name_to_abspath(self, name):
        return os.path.abspath(join(self.root, *name.split('/')))

+    def exists(self, name):
+        ''' Return True if the path that name maps to exists on disk. '''
+        abspath = self.name_to_abspath(name)
+        return os.path.exists(abspath)
+
    def href_to_name(self, href, base=None):
        '''
        Convert an href (relative to base) to a name. base must be a name or
-        None, in which self.root is used.
+        None, in which case self.root is used.
        '''
        if base is None:
            base = self.root
        else:
            base = os.path.dirname(self.name_to_abspath(base))
-        href = urlunquote(href.partition('#')[0])
+        purl = urlparse(href)
+        if purl.scheme or not purl.path or purl.path.startswith('/'):
+            return None
+        href = urlunquote(purl.path)
        fullpath = os.path.join(base, *href.split('/'))
        return self.abspath_to_name(fullpath)

@ -208,10 +215,19 @@ class Container(object):
        return self.parsed(self.opf_name)

    @property
-    def spine_items(self):
-        manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
+    def manifest_id_map(self):
+        return {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
            for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}

+    @property
+    def guide_type_map(self):
+        return {item.get('type', ''):self.href_to_name(item.get('href'), self.opf_name)
+            for item in self.opf_xpath('//opf:guide/opf:reference[@href and @type]')}
+
+    @property
+    def spine_items(self):
+        manifest_id_map = self.manifest_id_map
+
        linear, non_linear = [], []
        for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
            idref = item.get('idref')
@ -251,8 +267,8 @@ class Container(object):
                self.remove_from_xml(item)
                self.dirty(self.opf_name)

-        path = self.name_path_map.pop(name)
-        if os.path.exists(path):
+        path = self.name_path_map.pop(name, None)
+        if path and os.path.exists(path):
            os.remove(path)
        self.mime_map.pop(name, None)
        self.parsed_cache.pop(name, None)
@ -301,15 +317,24 @@ class Container(object):
            if idx == len(parent)-1:
                parent[idx-1].tail = parent.text

+    def opf_get_or_create(self, name):
+        '''
+        Return the first element in the OPF matching ``//opf:<name>``. If
+        there is none, mark the OPF as dirty, append a new empty element
+        with that tag to the first <package> element and return it.
+        '''
+        matches = self.opf_xpath('//opf:'+name)
+        if not matches:
+            self.dirty(self.opf_name)
+            package = self.opf_xpath('//opf:package')[0]
+            elem = package.makeelement(OPF(name))
+            # Keep the closing tag of the new element on its own line
+            elem.tail = '\n'
+            package.append(elem)
+            return elem
+        return matches[0]
+
    def generate_item(self, name, id_prefix=None, media_type=None):
        '''Add an item to the manifest with href derived from the given
        name. Ensures uniqueness of href and id automatically. Returns
        generated item.'''
        id_prefix = id_prefix or 'id'
        media_type = media_type or guess_type(name)[0]
-        path = self.name_to_abspath(name)
-        relpath = self.relpath(path, base=self.opf_dir)
-        href = urlquote(relpath)
+        href = self.name_to_href(name, self.opf_name)
        base, ext = href.rpartition('.')[0::2]
        all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
        c = 0
@ -319,8 +344,12 @@ class Container(object):
            item_id = id_prefix + '%d'%c
        all_names = {x.get('href') for x in self.opf_xpath(
                '//opf:manifest/opf:item[@href]')}
+
+        def exists(h):
+            return self.exists(self.href_to_name(h, self.opf_name))
+
        c = 0
-        while href in all_names:
+        while href in all_names or exists(href):
            c += 1
            href = '%s_%d.%s'%(base, c, ext)
        manifest = self.opf_xpath('//opf:manifest')[0]
@ -329,15 +358,26 @@ class Container(object):
        item.set('media-type', media_type)
        self.insert_into_xml(manifest, item)
        self.dirty(self.opf_name)
+        name = self.href_to_name(href, self.opf_name)
+        self.name_path_map[name] = self.name_to_abspath(name)
+        self.mime_map[name] = media_type
        return item

+    def commit_item(self, name):
+        '''
+        Write the parsed version of name back to its file: the entry is
+        removed from the dirtied set and the parsed cache, serialized using
+        its media type and written to the path recorded for it.
+        '''
+        self.dirtied.remove(name)
+        parsed = self.parsed_cache.pop(name)
+        raw = serialize(parsed, self.mime_map[name])
+        dest = self.name_path_map[name]
+        with open(dest, 'wb') as f:
+            f.write(raw)
+
+    def open(self, name, mode='rb'):
+        '''
+        Open the file corresponding to name with the given mode and return
+        the file object. If name is dirty, it is committed first so that the
+        file on disk is up to date.
+        '''
+        needs_commit = name in self.dirtied
+        if needs_commit:
+            self.commit_item(name)
+        path = self.name_to_abspath(name)
+        return open(path, mode)
+
    def commit(self, outpath=None):
        for name in tuple(self.dirtied):
-            self.dirtied.remove(name)
-            data = self.parsed_cache.pop(name)
-            data = serialize(data, self.mime_map[name])
-            with open(self.name_path_map[name], 'wb') as f:
-                f.write(data)
+            self.commit_item(name)

    def compare_to(self, other):
        if set(self.name_path_map) != set(other.name_path_map):
--- a/src/calibre/ebooks/oeb/polish/cover.py
+++ b/src/calibre/ebooks/oeb/polish/cover.py
@ -7,9 +7,10 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import shutil
+import shutil, re, os

-from calibre.ebooks.oeb.base import OPF
+from calibre.ebooks.oeb.base import OPF, OEB_DOCS, XPath, XLINK, xml2text
+from calibre.ebooks.oeb.polish.replace import replace_links

 def set_azw3_cover(container, cover_path, report):
    name = None
@ -33,4 +34,197 @@ def set_azw3_cover(container, cover_path, report):
 def set_cover(container, cover_path, report):
    if container.book_type == 'azw3':
        set_azw3_cover(container, cover_path, report)
+    else:
+        set_epub_cover(container, cover_path, report)
+
+###############################################################################
+# The delightful EPUB cover processing
+
+def is_raster_image(media_type):
+    '''
+    Test whether media_type (compared case-insensitively) is one of the
+    PNG, JPEG or GIF image types. A falsy media_type is returned unchanged.
+    '''
+    if not media_type:
+        return media_type
+    raster_types = {'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}
+    return media_type.lower() in raster_types
+
+# Lower-cased guide reference types that are treated as referring to a cover
+COVER_TYPES = {
+    'cover',
+    'coverimagestandard',
+    'thumbimagestandard',
+    'other.ms-coverimage',
+    'other.ms-coverimage-standard',
+    'other.ms-titleimage',
+    'other.ms-titleimage-standard',
+    'other.ms-thumbimage',
+    'other.ms-thumbimage-standard',
+}
+
+def find_cover_image(container):
+    '''
+    Find a raster image marked as a cover in the OPF.
+
+    The candidates are tried in this order: the manifest item pointed to by
+    a <meta name="cover"> element, a guide reference of type 'cover' and
+    finally the largest file among all guide references whose type is in
+    COVER_TYPES. Returns the name of the image or None if nothing was found.
+    '''
+    id_map = container.manifest_id_map
+    mime_map = container.mime_map
+
+    def is_raster(name):
+        return is_raster_image(mime_map.get(name, None))
+
+    # First look for a <meta name="cover"> pointing to a manifest item
+    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
+        name = id_map.get(meta.get('content'), None)
+        if is_raster(name):
+            return name
+
+    # Next look for a guide item with type == 'cover'
+    guide = container.guide_type_map
+    for ref_type, name in guide.iteritems():
+        if ref_type.lower() == 'cover' and is_raster(name):
+            return name
+
+    # Finally, pick the largest image from all possible guide cover items
+    best_name, best_size = None, 0
+    for ref_type, name in guide.iteritems():
+        if ref_type.lower() not in COVER_TYPES or not is_raster(name):
+            continue
+        path = container.name_path_map.get(name, None)
+        if not path:
+            continue
+        size = os.path.getsize(path)
+        if size > best_size:
+            best_name, best_size = name, size
+
+    if best_name:
+        return best_name
+
+def find_cover_page(container):
+    '''
+    Find a document marked as a cover in the OPF, i.e. the target of a guide
+    reference of type 'cover' whose media type is in OEB_DOCS. Returns its
+    name or None.
+    '''
+    mime_map = container.mime_map
+    for ref_type, name in container.guide_type_map.iteritems():
+        if ref_type.lower() != 'cover':
+            continue
+        if mime_map.get(name, '').lower() in OEB_DOCS:
+            return name
+
+def find_cover_image_in_page(container, cover_page):
+    '''
+    If the document cover_page is nothing more than a wrapper around a single
+    image (no non-whitespace text and at most one <img>/<svg:image> link),
+    return the name that image link resolves to, otherwise return None.
+    '''
+    root = container.parsed(cover_page)
+    bodies = XPath('//h:body')(root)
+    if len(bodies) != 1:
+        return
+    body = bodies[0]
+
+    find_images = XPath(
+        'descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')
+    # <img> uses src, <svg:image> uses xlink:href
+    hrefs = [img.get('src') or img.get(XLINK('href'))
+             for img in find_images(body)]
+    images = [container.href_to_name(href, base=cover_page)
+              for href in hrefs if href]
+
+    has_text = bool(re.sub(r'\s+', '', xml2text(body)))
+    if has_text or len(images) > 1:
+        # Document has more content than a single image
+        return
+    return images[0] if images else None
+
+def clean_opf(container):
+    '''
+    Remove all references to covers from the OPF.
+
+    This is a generator: it yields the names of the files that the removed
+    references pointed to (only names present in container.name_path_map).
+    The OPF is only marked as dirty once the generator has been run to
+    completion, so callers must consume it fully.
+    '''
+    # Snapshot the id -> name map before anything is removed
+    manifest_id_map = container.manifest_id_map
+    # <meta name="cover" content="manifest-item-id"/>
+    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
+        name = manifest_id_map.get(meta.get('content', None), None)
+        container.remove_from_xml(meta)
+        if name and name in container.name_path_map:
+            yield name
+
+    # Guide references whose (case-insensitive) type is in COVER_TYPES
+    gtm = container.guide_type_map
+    for ref in container.opf_xpath('//opf:guide/opf:reference[@type]'):
+        typ = ref.get('type', '')
+        if typ.lower() in COVER_TYPES:
+            container.remove_from_xml(ref)
+            # gtm is keyed by the type exactly as it is spelled in the OPF
+            name = gtm.get(typ, None)
+            if name and name in container.name_path_map:
+                yield name
+
+    container.dirty(container.opf_name)
+
+def create_epub_cover(container, cover_path):
+    '''
+    Add the image at cover_path to the book as a new raster cover, together
+    with a generated titlepage document that displays it, and register both
+    in the OPF: manifest entries, an itemref at the start of the spine, a
+    guide reference of type 'cover' and a <meta name="cover"> element.
+
+    Returns the tuple (raster_cover, titlepage) of the names of the two new
+    files.
+    '''
+    from calibre.ebooks.conversion.config import load_defaults
+    from calibre.ebooks.oeb.transforms.cover import CoverManager
+
+    ext = cover_path.rpartition('.')[-1].lower()
+    raster_cover_item = container.generate_item('cover.'+ext, id_prefix='cover')
+    raster_cover = container.href_to_name(raster_cover_item.get('href'),
+                                          container.opf_name)
+    with open(cover_path, 'rb') as src, container.open(raster_cover, 'wb') as dest:
+        shutil.copyfileobj(src, dest)
+    opts = load_defaults('epub_output')
+    keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
+    no_svg = opts.get('no_svg_cover', False)
+    if no_svg:
+        # %% because the template is run through % formatting below
+        style = 'style="height: 100%%"'
+        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
+    else:
+        width, height = 600, 800
+        ar = 'xMidYMid meet' if keep_aspect else 'none'
+        templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
+        templ = templ.replace('__viewbox__', '0 0 %d %d'%(width, height))
+        templ = templ.replace('__width__',  str(width))
+        templ = templ.replace('__height__', str(height))
+    titlepage_item = container.generate_item('titlepage.xhtml',
+                                             id_prefix='titlepage')
+    titlepage = container.href_to_name(titlepage_item.get('href'),
+                                          container.opf_name)
+    # The link to the image lives inside the titlepage, so it must be
+    # relative to the titlepage, not to the default base
+    raw = templ%container.name_to_href(raster_cover, titlepage).encode('utf-8')
+    with container.open(titlepage, 'wb') as f:
+        f.write(raw)
+
+    spine = container.opf_xpath('//opf:spine')[0]
+    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
+    container.insert_into_xml(spine, ref, index=0)
+    guide = container.opf_get_or_create('guide')
+    # hrefs in the guide are resolved relative to the OPF (see
+    # Container.guide_type_map), so generate the href relative to it
+    container.insert_into_xml(guide, guide.makeelement(
+        OPF('reference'), type='cover', title=_('Cover'),
+        href=container.name_to_href(titlepage, container.opf_name)))
+    metadata = container.opf_get_or_create('metadata')
+    meta = metadata.makeelement(OPF('meta'), name='cover')
+    meta.set('content', raster_cover_item.get('id'))
+    container.insert_into_xml(metadata, meta)
+
+    return raster_cover, titlepage
+
+def set_epub_cover(container, cover_path, report):
+    '''
+    Replace the cover of the EPUB in container with the image at cover_path.
+
+    Existing cover references are removed from the OPF, an existing cover
+    page that merely wraps a single image is removed (together with a
+    duplicate second cover page, if any), a new raster cover and titlepage
+    are created and links to the old cover files are redirected to the new
+    ones. report is called with a one line message describing the result.
+    '''
+    cover_image = find_cover_image(container)
+    cover_page = find_cover_page(container)
+    wrapped_image = extra_cover_page = None
+    updated = False
+
+    # clean_opf() is a generator, it has to be fully consumed for the
+    # references to actually be removed from the OPF
+    possible_removals = set(clean_opf(container))
+    possible_removals  # not used yet, see the TODO below
+    # TODO: Handle possible_removals and also iterate over links in the removed
+    # pages and handle possibly removing stylesheets referred to by them.
+
+    spine_items = tuple(container.spine_items)
+    if cover_page is None and spine_items:
+        # Check if the first item in the spine is a simple cover wrapper
+        # (a book with an empty spine has no such candidate)
+        candidate = container.abspath_to_name(spine_items[0])
+        if find_cover_image_in_page(container, candidate) is not None:
+            cover_page = candidate
+
+    if cover_page is not None:
+        wrapped_image = find_cover_image_in_page(container, cover_page)
+
+        if len(spine_items) > 1:
+            # Look for an extra cover page
+            c = container.abspath_to_name(spine_items[1])
+            if c != cover_page:
+                candidate = find_cover_image_in_page(container, c)
+                if candidate and candidate in {wrapped_image, cover_image}:
+                    # This page has only a single image and that image is the
+                    # cover image, remove it.
+                    container.remove_item(c)
+                    extra_cover_page = c
+                    spine_items = spine_items[:1] + spine_items[2:]
+
+        if wrapped_image is not None:
+            # The cover page is a simple wrapper around a single cover image,
+            # we can remove it safely.
+            container.remove_item(cover_page)
+            container.remove_item(wrapped_image)
+            updated = True
+
+    if cover_image and cover_image != wrapped_image:
+        # Remove the old cover image
+        container.remove_item(cover_image)
+
+    # Insert the new cover
+    raster_cover, titlepage = create_epub_cover(container, cover_path)
+
+    report('Cover updated' if updated else 'Cover inserted')
+
+    # Replace links to the old cover image/cover page, dropping any
+    # fragments since they cannot exist in the new files
+    link_sub = {s:d for s, d in {
+        cover_page:titlepage, wrapped_image:raster_cover,
+        cover_image:raster_cover, extra_cover_page:titlepage}.iteritems()
+        if s is not None}
+    if link_sub:
+        replace_links(container, link_sub, frag_map=lambda x, y:None)
+

--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@ -118,9 +118,9 @@ def option_parser():
    a = parser.add_option
    o = partial(a, default=False, action='store_true')
    o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
-    a('--cover', help=_(
+    a('--cover', '-c', help=_(
        'Path to a cover image. Changes the cover specified in the ebook. '
-        'If no cover is present, inserts a new cover.'))
+        'If no cover is present, or the cover is not properly identified, inserts a new cover.'))
    o('--verbose', help=_('Produce more verbose output, useful for debugging.'))

    return parser
--- a/src/calibre/ebooks/oeb/polish/replace.py
+++ b/src/calibre/ebooks/oeb/polish/replace.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from urlparse import urlparse
+
+from cssutils import replaceUrls
+
+from calibre import guess_type
+from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
+
+class LinkReplacer(object):
+    '''
+    Callable that maps a URL found in the file named base to its
+    replacement, according to link_map (old name -> new name) and frag_map
+    (called as frag_map(old_name, fragment) to get the new fragment, a falsy
+    result drops the fragment). URLs that do not resolve to a name in
+    link_map are returned unchanged. self.replaced is set to True as soon
+    as a URL has actually been changed.
+    '''
+
+    def __init__(self, base, container, link_map, frag_map):
+        self.container, self.base = container, base
+        self.link_map, self.frag_map = link_map, frag_map
+        self.replaced = False
+
+    def __call__(self, url):
+        old_name = self.container.href_to_name(url, self.base)
+        new_name = self.link_map.get(old_name, None) if old_name else None
+        if not new_name:
+            return url
+        fragment = urlparse(url).fragment
+        new_url = self.container.name_to_href(new_name, self.base)
+        if fragment:
+            new_fragment = self.frag_map(old_name, fragment)
+            if new_fragment:
+                new_url += '#%s'%new_fragment
+        if new_url != url:
+            self.replaced = True
+        return new_url
+
+def replace_links(container, link_map, frag_map=lambda name, frag:frag):
+    '''
+    Rewrite the links in all documents, stylesheets and NCX files in
+    container according to link_map (old name -> new name). frag_map is
+    called with (old name, fragment) to map fragments. Every file in which
+    a link was actually changed is marked as dirty.
+    '''
+    ncx_type = guess_type('toc.ncx')[0]
+    for name, media_type in container.mime_map.iteritems():
+        replacer = LinkReplacer(name, container, link_map, frag_map)
+        mt = media_type.lower()
+        if mt in OEB_DOCS:
+            rewrite_links(container.parsed(name), replacer)
+        elif mt in OEB_STYLES:
+            replaceUrls(container.parsed(name), replacer)
+        elif mt == ncx_type:
+            for elem in container.parsed(name).xpath('//*[@src]'):
+                old_src = elem.get('src')
+                new_src = replacer(old_src)
+                if new_src != old_src:
+                    elem.set('src', new_src)
+
+        if replacer.replaced:
+            container.dirty(name)
+
+