ebook-polish: Update covers in epub

2025-11-13 18:16:59 -05:00 · 2013-02-12 10:27:47 +05:30 · 2013-02-12 10:27:47 +05:30 · c91c1aeba2
commit c91c1aeba2
parent 9a0164059a
4 changed files with 314 additions and 19 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
 import os, logging, sys, hashlib, uuid
 from urllib import unquote as urlunquote, quote as urlquote
 from urlparse import urlparse
 from lxml import etree
@ -96,16 +97,22 @@ class Container(object):
    def name_to_abspath(self, name):
        '''
        Return the absolute filesystem path for name, built by joining the
        '/'-separated components of name onto self.root.
        '''
        components = name.split('/')
        return os.path.abspath(join(self.root, *components))
    def exists(self, name):
        '''
        Return True if the filesystem path that name maps to (as computed by
        name_to_abspath) exists.
        '''
        abspath = self.name_to_abspath(name)
        return os.path.exists(abspath)
    def href_to_name(self, href, base=None):
        '''
        Convert an href (relative to base) to a name. base must be a name or
-        None, in which self.root is used.
+        None, in which case self.root is used.
        '''
        if base is None:
            base = self.root
        else:
            base = os.path.dirname(self.name_to_abspath(base))
-        href = urlunquote(href.partition('#')[0])
+        purl = urlparse(href)
        if purl.scheme or not purl.path or purl.path.startswith('/'):
            return None
        href = urlunquote(purl.path)
        fullpath = os.path.join(base, *href.split('/'))
        return self.abspath_to_name(fullpath)
@ -208,10 +215,19 @@ class Container(object):
        return self.parsed(self.opf_name)
    @property
-    def spine_items(self):
+    def manifest_id_map(self):
-        manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
+        return {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
            for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}
    @property
    def guide_type_map(self):
        '''
        Mapping of the type attribute of every guide reference in the OPF
        (that has both href and type attributes) to the name its href
        resolves to, with the href interpreted relative to the OPF.
        '''
        references = self.opf_xpath(
            '//opf:guide/opf:reference[@href and @type]')
        ans = {}
        for ref in references:
            ans[ref.get('type', '')] = self.href_to_name(
                ref.get('href'), self.opf_name)
        return ans
    @property
    def spine_items(self):
        manifest_id_map = self.manifest_id_map
        linear, non_linear = [], []
        for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
            idref = item.get('idref')
@ -251,8 +267,8 @@ class Container(object):
                self.remove_from_xml(item)
                self.dirty(self.opf_name)
-        path = self.name_path_map.pop(name)
+        path = self.name_path_map.pop(name, None)
-        if os.path.exists(path):
+        if path and os.path.exists(path):
            os.remove(path)
        self.mime_map.pop(name, None)
        self.parsed_cache.pop(name, None)
@ -301,15 +317,24 @@ class Container(object):
            if idx == len(parent)-1:
                parent[idx-1].tail = parent.text
    def opf_get_or_create(self, name):
        '''
        Return the first element called name (in the OPF namespace) found
        anywhere in the OPF. If there is none, a new empty element is
        appended to the package element, the OPF is marked dirty and the new
        element is returned.
        '''
        matches = self.opf_xpath('//opf:'+name)
        if not matches:
            self.dirty(self.opf_name)
            root = self.opf_xpath('//opf:package')[0]
            created = root.makeelement(OPF(name))
            # Keep the closing tag of the package on a line of its own
            created.tail = '\n'
            root.append(created)
            return created
        return matches[0]
    def generate_item(self, name, id_prefix=None, media_type=None):
        '''Add an item to the manifest with href derived from the given
        name. Ensures uniqueness of href and id automatically. Returns
        generated item.'''
        id_prefix = id_prefix or 'id'
        media_type = media_type or guess_type(name)[0]
-        path = self.name_to_abspath(name)
+        href = self.name_to_href(name, self.opf_name)
        relpath = self.relpath(path, base=self.opf_dir)
        href = urlquote(relpath)
        base, ext = href.rpartition('.')[0::2]
        all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
        c = 0
@ -319,8 +344,12 @@ class Container(object):
            item_id = id_prefix + '%d'%c
        all_names = {x.get('href') for x in self.opf_xpath(
                '//opf:manifest/opf:item[@href]')}
        def exists(h):
            return self.exists(self.href_to_name(h, self.opf_name))
        c = 0
-        while href in all_names:
+        while href in all_names or exists(href):
            c += 1
            href = '%s_%d.%s'%(base, c, ext)
        manifest = self.opf_xpath('//opf:manifest')[0]
@ -329,16 +358,27 @@ class Container(object):
        item.set('media-type', media_type)
        self.insert_into_xml(manifest, item)
        self.dirty(self.opf_name)
        name = self.href_to_name(href, self.opf_name)
        self.name_path_map[name] = self.name_to_abspath(name)
        self.mime_map[name] = media_type
        return item
-    def commit(self, outpath=None):
+    def commit_item(self, name):
        for name in tuple(self.dirtied):
        self.dirtied.remove(name)
        data = self.parsed_cache.pop(name)
        data = serialize(data, self.mime_map[name])
        with open(self.name_path_map[name], 'wb') as f:
            f.write(data)
    def open(self, name, mode='rb'):
        '''
        Open the file corresponding to name and return the file object.

        If name has been dirtied, it is committed first via commit_item()
        before the file is opened.
        '''
        needs_commit = name in self.dirtied
        if needs_commit:
            self.commit_item(name)
        path = self.name_to_abspath(name)
        return open(path, mode)
    def commit(self, outpath=None):
        '''
        Commit every dirtied item by calling commit_item() on it. outpath is
        not used by this implementation.
        '''
        # Snapshot first, self.dirtied is iterated while items are committed
        pending = tuple(self.dirtied)
        for dirty_name in pending:
            self.commit_item(dirty_name)
    def compare_to(self, other):
        if set(self.name_path_map) != set(other.name_path_map):
            return 'Set of files is not the same'
--- a/src/calibre/ebooks/oeb/polish/cover.py
+++ b/src/calibre/ebooks/oeb/polish/cover.py
@ -7,9 +7,10 @@ __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import shutil
+import shutil, re, os
-from calibre.ebooks.oeb.base import OPF
+from calibre.ebooks.oeb.base import OPF, OEB_DOCS, XPath, XLINK, xml2text
 from calibre.ebooks.oeb.polish.replace import replace_links
 def set_azw3_cover(container, cover_path, report):
    name = None
@ -33,4 +34,197 @@ def set_azw3_cover(container, cover_path, report):
 def set_cover(container, cover_path, report):
    '''
    Set the cover of the book in container to the image at cover_path,
    dispatching on container.book_type: 'azw3' books are handled by
    set_azw3_cover(), everything else by set_epub_cover().
    '''
    is_azw3 = container.book_type == 'azw3'
    setter = set_azw3_cover if is_azw3 else set_epub_cover
    setter(container, cover_path, report)
 ###############################################################################
 # The delightful EPUB cover processing
 def is_raster_image(media_type):
    'Truthy if media_type (compared case insensitively) is a PNG, JPEG or GIF image type'
    if not media_type:
        # A missing/empty media type is handed back unchanged (falsy)
        return media_type
    raster_types = {'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}
    return media_type.lower() in raster_types
 # Guide reference types that mark a cover. The type attribute of a guide
 # reference is lower-cased before being looked up in this set.
 COVER_TYPES = {
    'cover',
    'coverimagestandard',
    'thumbimagestandard',
    'other.ms-coverimage',
    'other.ms-coverimage-standard',
    'other.ms-thumbimage',
    'other.ms-thumbimage-standard',
    'other.ms-titleimage',
    'other.ms-titleimage-standard',
 }
 def find_cover_image(container):
    '''
    Find a raster image marked as a cover in the OPF.

    The following are tried, in order:

      1. <meta name="cover" content="..."> entries, whose content is the
         id of a manifest item
      2. A guide reference whose type is 'cover'
      3. Of all guide references whose type is in COVER_TYPES, the one
         whose file is the largest on disk

    In every case only items whose media type satisfies is_raster_image()
    are considered. Returns the name of the image or None.
    '''
    id_to_name = container.manifest_id_map
    mime_map = container.mime_map
    def is_raster(name):
        return is_raster_image(mime_map.get(name, None))
    # The cover meta tag
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        candidate = id_to_name.get(meta.get('content'), None)
        if is_raster(candidate):
            return candidate
    # A guide item with type == 'cover'
    guide_refs = container.guide_type_map
    for ref_type, name in guide_refs.iteritems():
        if ref_type.lower() == 'cover' and is_raster(name):
            return name
    # The largest image from all possible guide cover items
    best_name, best_size = None, 0
    for ref_type, name in guide_refs.iteritems():
        if ref_type.lower() not in COVER_TYPES or not is_raster(name):
            continue
        path = container.name_path_map.get(name, None)
        if not path:
            continue
        size = os.path.getsize(path)
        if size > best_size:
            best_name, best_size = name, size
    return best_name or None
 def find_cover_page(container):
    '''
    Find a document marked as a cover in the OPF, i.e. the target of a guide
    reference of type 'cover' whose media type is in OEB_DOCS. Returns its
    name or None.
    '''
    mime_map = container.mime_map
    for ref_type, name in container.guide_type_map.iteritems():
        if ref_type.lower() != 'cover':
            continue
        media_type = mime_map.get(name, '')
        if media_type.lower() in OEB_DOCS:
            return name
    return None
 def find_cover_image_in_page(container, cover_page):
    '''
    If the document cover_page is nothing more than a wrapper around a
    single image, return the name that image's link resolves to (as given by
    container.href_to_name()). Returns None if the document does not have
    exactly one body, contains any non-whitespace text, contains more than
    one image or contains no image at all.
    '''
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return None
    body = bodies[0]
    find_images = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')
    # <img> carries its link in src, an SVG <image> in xlink:href
    hrefs = (img.get('src') or img.get(XLINK('href')) for img in find_images(body))
    images = [container.href_to_name(href, base=cover_page) for href in hrefs if href]
    text = re.sub(r'\s+', '', xml2text(body))
    if text or len(images) > 1:
        # Document has more content than a single image
        return None
    return images[0] if images else None
 def clean_opf(container):
    '''
    Remove all references to covers from the OPF.

    This is a generator. It removes every <meta name="cover"> element and
    every guide reference whose type is in COVER_TYPES and yields the names
    of the files those elements pointed to, provided the name is present in
    container.name_path_map. The removals are performed as the generator is
    iterated and the OPF is marked dirty only once it has been exhausted.
    '''
    def is_known(name):
        return name and name in container.name_path_map
    id_to_name = container.manifest_id_map
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        target = id_to_name.get(meta.get('content', None), None)
        container.remove_from_xml(meta)
        if is_known(target):
            yield target
    guide_names = container.guide_type_map
    for ref in container.opf_xpath('//opf:guide/opf:reference[@type]'):
        ref_type = ref.get('type', '')
        if ref_type.lower() not in COVER_TYPES:
            continue
        container.remove_from_xml(ref)
        target = guide_names.get(ref_type, None)
        if is_known(target):
            yield target
    container.dirty(container.opf_name)
 def create_epub_cover(container, cover_path):
    '''
    Add the image at cover_path to the book as its cover, together with a
    generated titlepage document that displays it.

    The image and the titlepage are added to the manifest via
    container.generate_item(). The titlepage is inserted as the first
    itemref of the spine, a guide reference of type "cover" pointing to the
    titlepage is added and a <meta name="cover"> pointing to the image item
    is added to the metadata.

    :param container: The book container to modify
    :param cover_path: Filesystem path to the new cover image
    :return: ``(raster_cover, titlepage)``, the names of the newly added
             image and of the newly added titlepage
    '''
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager
    # splitext() only looks at the last path component, so a dot in a
    # directory name is not mistaken for the start of the file extension
    ext = os.path.splitext(cover_path)[1][1:].lower()
    raster_cover_item = container.generate_item('cover.'+ext, id_prefix='cover')
    raster_cover = container.href_to_name(raster_cover_item.get('href'),
                                          container.opf_name)
    with open(cover_path, 'rb') as src, container.open(raster_cover, 'wb') as dest:
        shutil.copyfileobj(src, dest)
    opts = load_defaults('epub_output')
    keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
    no_svg = opts.get('no_svg_cover', False)
    if no_svg:
        # The percent sign is doubled because templ is %-formatted below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
    else:
        width, height = 600, 800
        ar = 'xMidYMid meet' if keep_aspect else 'none'
        templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
        templ = templ.replace('__viewbox__', '0 0 %d %d'%(width, height))
        templ = templ.replace('__width__',  str(width))
        templ = templ.replace('__height__', str(height))
    titlepage_item = container.generate_item('titlepage.xhtml',
                                             id_prefix='titlepage')
    titlepage = container.href_to_name(titlepage_item.get('href'),
                                          container.opf_name)
    # The link to the image lives inside the titlepage, so it has to be
    # relative to the titlepage, not to the root of the container
    cover_href = container.name_to_href(raster_cover, titlepage)
    raw = templ%cover_href.encode('utf-8')
    with container.open(titlepage, 'wb') as f:
        f.write(raw)
    spine = container.opf_xpath('//opf:spine')[0]
    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
    container.insert_into_xml(spine, ref, index=0)
    guide = container.opf_get_or_create('guide')
    # hrefs in the guide are resolved relative to the OPF when they are read
    # back, so the href must be generated relative to the OPF as well
    container.insert_into_xml(guide, guide.makeelement(
        OPF('reference'), type='cover', title=_('Cover'),
        href=container.name_to_href(titlepage, container.opf_name)))
    metadata = container.opf_get_or_create('metadata')
    meta = metadata.makeelement(OPF('meta'), name='cover')
    meta.set('content', raster_cover_item.get('id'))
    container.insert_into_xml(metadata, meta)
    return raster_cover, titlepage
 def set_epub_cover(container, cover_path, report):
    '''
    Replace the cover of an EPUB with the image at cover_path, or insert
    that image as a new cover if no existing cover can be identified.

    Existing cover references are stripped from the OPF. A cover page that
    is a simple wrapper around a single image is removed along with the
    image it wraps, as is a second spine item that only repeats the cover
    image. The old cover image is removed, a new cover image and titlepage
    are created and links to the removed files are redirected to the new
    ones.

    :param container: The book container to modify
    :param cover_path: Filesystem path to the new cover image
    :param report: Callable that is passed a one line status message
    '''
    cover_image = find_cover_image(container)
    cover_page = find_cover_page(container)
    wrapped_image = extra_cover_page = None
    updated = False
    # clean_opf() is a generator, it has to be consumed for the cover
    # references to actually be removed from the OPF
    possible_removals = set(clean_opf(container))
    # NOTE(review): bare reference, presumably only there so the name counts
    # as used until the TODO below is implemented
    possible_removals
    # TODO: Handle possible_removals and also iterate over links in the removed
    # pages and handle possibly removing stylesheets referred to by them.
    spine_items = tuple(container.spine_items)
    # An empty spine has no first item to inspect, indexing it would raise
    # IndexError
    if cover_page is None and spine_items:
        # Check if the first item in the spine is a simple cover wrapper
        candidate = container.abspath_to_name(spine_items[0])
        if find_cover_image_in_page(container, candidate) is not None:
            cover_page = candidate
    if cover_page is not None:
        wrapped_image = find_cover_image_in_page(container, cover_page)
        if len(spine_items) > 1:
            # Look for an extra cover page
            c = container.abspath_to_name(spine_items[1])
            if c != cover_page:
                candidate = find_cover_image_in_page(container, c)
                if candidate and candidate in {wrapped_image, cover_image}:
                    # This page has only a single image and that image is the
                    # cover image, remove it.
                    container.remove_item(c)
                    extra_cover_page = c
                    spine_items = spine_items[:1] + spine_items[2:]
        if wrapped_image is not None:
            # The cover page is a simple wrapper around a single cover image,
            # we can remove it safely.
            container.remove_item(cover_page)
            container.remove_item(wrapped_image)
            updated = True
    if cover_image and cover_image != wrapped_image:
        # Remove the old cover image
        container.remove_item(cover_image)
    # Insert the new cover
    raster_cover, titlepage = create_epub_cover(container, cover_path)
    report('Cover updated' if updated else 'Cover inserted')
    # Replace links to the old cover image/cover page
    link_sub = {s:d for s, d in {
        cover_page:titlepage, wrapped_image:raster_cover,
        cover_image:raster_cover, extra_cover_page:titlepage}.iteritems()
        if s is not None}
    if link_sub:
        # Fragments cannot be carried over to the new files, so drop them
        replace_links(container, link_sub, frag_map=lambda x, y:None)
--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@ -118,9 +118,9 @@ def option_parser():
    a = parser.add_option
    o = partial(a, default=False, action='store_true')
    o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
-    a('--cover', help=_(
+    a('--cover', '-c', help=_(
        'Path to a cover image. Changes the cover specified in the ebook. '
-        'If no cover is present, inserts a new cover.'))
+        'If no cover is present, or the cover is not properly identified, inserts a new cover.'))
    o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
    return parser
--- a/src/calibre/ebooks/oeb/polish/replace.py
+++ b/src/calibre/ebooks/oeb/polish/replace.py
@ -0,0 +1,61 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from urlparse import urlparse
 from cssutils import replaceUrls
 from calibre import guess_type
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
 class LinkReplacer(object):
    '''
    Callable that maps a URL found in the file named base to its
    replacement.

    link_map maps the names of link targets to their new names. frag_map is
    called as frag_map(old_name, fragment) for URLs that have a fragment and
    returns the fragment to use in the new URL (a false value drops the
    fragment). URLs that do not resolve to a name in link_map are returned
    unchanged. The replaced attribute becomes True once any call has
    returned a URL different from the one it was given.
    '''
    def __init__(self, base, container, link_map, frag_map):
        self.container = container
        self.base = base
        self.link_map = link_map
        self.frag_map = frag_map
        self.replaced = False
    def __call__(self, url):
        old_name = self.container.href_to_name(url, self.base)
        new_name = self.link_map.get(old_name, None) if old_name else None
        if not new_name:
            return url
        fragment = urlparse(url).fragment
        new_url = self.container.name_to_href(new_name, self.base)
        if fragment:
            new_fragment = self.frag_map(old_name, fragment)
            if new_fragment:
                new_url += '#%s'%new_fragment
        if new_url != url:
            self.replaced = True
        return new_url
 def replace_links(container, link_map, frag_map=lambda name, frag:frag):
    '''
    Rewrite links in all the OEB documents, stylesheets and NCX files of
    container, according to link_map.

    :param link_map: Maps the names of link targets to their new names
    :param frag_map: Called as frag_map(name, fragment) to get the fragment
                     for a rewritten link, the default keeps the fragment
                     unchanged

    Files in which at least one link was changed are marked as dirty.
    '''
    ncx_type = guess_type('toc.ncx')[0]
    def rewrite_ncx(root, replacer):
        # In an NCX the links are carried by src attributes
        for elem in root.xpath('//*[@src]'):
            old_src = elem.get('src')
            new_src = replacer(old_src)
            if old_src != new_src:
                elem.set('src', new_src)
    for name, media_type in container.mime_map.iteritems():
        replacer = LinkReplacer(name, container, link_map, frag_map)
        kind = media_type.lower()
        if kind in OEB_DOCS:
            rewrite_links(container.parsed(name), replacer)
        elif kind in OEB_STYLES:
            replaceUrls(container.parsed(name), replacer)
        elif kind == ncx_type:
            rewrite_ncx(container.parsed(name), replacer)
        if replacer.replaced:
            container.dirty(name)