ebook-polish: Update covers in epub

This commit is contained in:
Kovid Goyal 2013-02-12 10:27:47 +05:30
parent 9a0164059a
commit c91c1aeba2
4 changed files with 314 additions and 19 deletions

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import os, logging, sys, hashlib, uuid
from urllib import unquote as urlunquote, quote as urlquote
from urlparse import urlparse
from lxml import etree
@ -96,16 +97,22 @@ class Container(object):
def name_to_abspath(self, name):
    'Convert a name (/ separated, relative to self.root) to an absolute filesystem path'
    components = name.split('/')
    return os.path.abspath(join(self.root, *components))
def exists(self, name):
    'Return True iff the path that name maps to exists on the filesystem'
    path = self.name_to_abspath(name)
    return os.path.exists(path)
def href_to_name(self, href, base=None):
    '''
    Convert an href (relative to base) to a name. base must be a name or
    None, in which case self.root is used.

    Returns None for an href that has a URL scheme, an empty path (for
    example a bare ``#fragment``) or an absolute path.
    '''
    if base is None:
        base = self.root
    else:
        base = os.path.dirname(self.name_to_abspath(base))
    # Parse the href *before* unquoting it, so that a percent-encoded '#' or
    # ':' in the path is not mistaken for a fragment separator or a scheme,
    # and unquote exactly once so that an encoded '%' survives intact.
    purl = urlparse(href)
    if purl.scheme or not purl.path or purl.path.startswith('/'):
        return None
    href = urlunquote(purl.path)
    fullpath = os.path.join(base, *href.split('/'))
    return self.abspath_to_name(fullpath)
@ -208,10 +215,19 @@ class Container(object):
return self.parsed(self.opf_name)
@property
def manifest_id_map(self):
    '''Mapping of manifest item id to the name its href resolves to
    (hrefs are resolved relative to the OPF file).'''
    return {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
        for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}
@property
def guide_type_map(self):
    '''Mapping of guide reference type to the name its href resolves to
    (hrefs are resolved relative to the OPF file).'''
    references = self.opf_xpath('//opf:guide/opf:reference[@href and @type]')
    type_map = {}
    for ref in references:
        # Later references of the same type replace earlier ones
        type_map[ref.get('type', '')] = self.href_to_name(ref.get('href'), self.opf_name)
    return type_map
@property
def spine_items(self):
manifest_id_map = self.manifest_id_map
linear, non_linear = [], []
for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
idref = item.get('idref')
@ -251,8 +267,8 @@ class Container(object):
self.remove_from_xml(item)
self.dirty(self.opf_name)
path = self.name_path_map.pop(name)
if os.path.exists(path):
path = self.name_path_map.pop(name, None)
if path and os.path.exists(path):
os.remove(path)
self.mime_map.pop(name, None)
self.parsed_cache.pop(name, None)
@ -301,15 +317,24 @@ class Container(object):
if idx == len(parent)-1:
parent[idx-1].tail = parent.text
def opf_get_or_create(self, name):
    '''Return the first element in the OPF with the tag name. If there is
    none, a new empty element is appended to <package> (marking the OPF
    dirty) and returned.'''
    existing = self.opf_xpath('//opf:'+name)
    if not existing:
        self.dirty(self.opf_name)
        package = self.opf_xpath('//opf:package')[0]
        created = package.makeelement(OPF(name))
        created.tail = '\n'
        package.append(created)
        return created
    return existing[0]
def generate_item(self, name, id_prefix=None, media_type=None):
'''Add an item to the manifest with href derived from the given
name. Ensures uniqueness of href and id automatically. Returns
generated item.'''
id_prefix = id_prefix or 'id'
media_type = media_type or guess_type(name)[0]
path = self.name_to_abspath(name)
relpath = self.relpath(path, base=self.opf_dir)
href = urlquote(relpath)
href = self.name_to_href(name, self.opf_name)
base, ext = href.rpartition('.')[0::2]
all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
c = 0
@ -319,8 +344,12 @@ class Container(object):
item_id = id_prefix + '%d'%c
all_names = {x.get('href') for x in self.opf_xpath(
'//opf:manifest/opf:item[@href]')}
def exists(h):
return self.exists(self.href_to_name(h, self.opf_name))
c = 0
while href in all_names:
while href in all_names or exists(href):
c += 1
href = '%s_%d.%s'%(base, c, ext)
manifest = self.opf_xpath('//opf:manifest')[0]
@ -329,15 +358,26 @@ class Container(object):
item.set('media-type', media_type)
self.insert_into_xml(manifest, item)
self.dirty(self.opf_name)
name = self.href_to_name(href, self.opf_name)
self.name_path_map[name] = self.name_to_abspath(name)
self.mime_map[name] = media_type
return item
def commit_item(self, name):
    '''Write the parsed (in memory) form of name back to its file. name
    must currently be dirty; it is removed from self.dirtied and from the
    parsed cache.'''
    self.dirtied.remove(name)
    tree = self.parsed_cache.pop(name)
    raw = serialize(tree, self.mime_map[name])
    with open(self.name_path_map[name], 'wb') as dest:
        dest.write(raw)
def open(self, name, mode='rb'):
    '''Open the file that name maps to, in the specified mode. If name is
    dirty, it is committed to disk first.'''
    needs_commit = name in self.dirtied
    if needs_commit:
        self.commit_item(name)
    path = self.name_to_abspath(name)
    return open(path, mode)
def commit(self, outpath=None):
    '''Write all dirtied items to disk, using commit_item().

    outpath is accepted but not used by this implementation.'''
    # Iterate over a snapshot as commit_item() removes names from
    # self.dirtied. The serialization is done only in commit_item(), doing
    # it here as well would fail on the second removal from self.dirtied.
    for name in tuple(self.dirtied):
        self.commit_item(name)
def compare_to(self, other):
if set(self.name_path_map) != set(other.name_path_map):

View File

@ -7,9 +7,10 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil
import shutil, re, os
from calibre.ebooks.oeb.base import OPF
from calibre.ebooks.oeb.base import OPF, OEB_DOCS, XPath, XLINK, xml2text
from calibre.ebooks.oeb.polish.replace import replace_links
def set_azw3_cover(container, cover_path, report):
name = None
@ -33,4 +34,197 @@ def set_azw3_cover(container, cover_path, report):
def set_cover(container, cover_path, report):
    'Set the cover of the book in container, using the handler for its book type'
    handler = set_azw3_cover if container.book_type == 'azw3' else set_epub_cover
    handler(container, cover_path, report)
###############################################################################
# The delightful EPUB cover processing
def is_raster_image(media_type):
    '''Test if media_type is a PNG, JPEG or GIF mime type (case
    insensitive). A false media_type (None, empty string) is returned
    unchanged.'''
    if not media_type:
        return media_type
    raster_types = {'image/gif', 'image/jpeg', 'image/jpg', 'image/png'}
    return media_type.lower() in raster_types
# Guide reference types (compared in lower case) that are treated as
# referring to a cover
COVER_TYPES = {
    'cover',
    'coverimagestandard',
    'thumbimagestandard',
    'other.ms-coverimage',
    'other.ms-coverimage-standard',
    'other.ms-thumbimage',
    'other.ms-thumbimage-standard',
    'other.ms-titleimage',
    'other.ms-titleimage-standard',
}
def find_cover_image(container):
    'Find a raster image marked as a cover in the OPF'
    id_to_name = container.manifest_id_map
    mime_map = container.mime_map

    def is_raster(name):
        return is_raster_image(mime_map.get(name, None))

    # Highest priority: <meta name="cover"> pointing at a manifest item
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        name = id_to_name.get(meta.get('content'), None)
        if is_raster(name):
            return name

    guide = container.guide_type_map
    # Next, look for a guide item with type == 'cover'
    for ref_type, name in guide.iteritems():
        if ref_type.lower() == 'cover' and is_raster(name):
            return name

    # Finally, pick the largest image from all possible guide cover items
    best_name, best_size = None, 0
    for ref_type, name in guide.iteritems():
        if ref_type.lower() not in COVER_TYPES or not is_raster(name):
            continue
        path = container.name_path_map.get(name, None)
        if not path:
            continue
        size = os.path.getsize(path)
        if size > best_size:
            best_name, best_size = name, size
    if best_name:
        return best_name
def find_cover_page(container):
    'Find a document marked as a cover in the OPF'
    mime_map = container.mime_map
    for ref_type, name in container.guide_type_map.iteritems():
        if ref_type.lower() != 'cover':
            continue
        # Only guide entries that point to a document qualify
        if mime_map.get(name, '').lower() in OEB_DOCS:
            return name
def find_cover_image_in_page(container, cover_page):
    '''If cover_page has exactly one <body> that contains no text and
    exactly one image (an <img src> or an svg <image>), return whatever
    href_to_name() gives for the link to that image. Otherwise return
    None.'''
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return
    body = bodies[0]
    find_images = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')
    # <img> uses src, svg <image> uses xlink:href
    hrefs = (img.get('src') or img.get(XLINK('href')) for img in find_images(body))
    images = [container.href_to_name(href, base=cover_page) for href in hrefs if href]
    has_text = bool(re.sub(r'\s+', '', xml2text(body)))
    if has_text or len(images) > 1:
        # Document has more content than a single image
        return
    if images:
        return images[0]
def clean_opf(container):
    '''Remove all references to covers from the OPF. This is a generator,
    it yields the names of the files the removed references pointed to
    and has to be iterated to completion for the OPF to be marked dirty.'''
    id_to_name = container.manifest_id_map
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        target = id_to_name.get(meta.get('content', None), None)
        container.remove_from_xml(meta)
        if target and target in container.name_path_map:
            yield target

    type_to_name = container.guide_type_map
    for ref in container.opf_xpath('//opf:guide/opf:reference[@type]'):
        ref_type = ref.get('type', '')
        if ref_type.lower() not in COVER_TYPES:
            continue
        container.remove_from_xml(ref)
        target = type_to_name.get(ref_type, None)
        if target and target in container.name_path_map:
            yield target

    container.dirty(container.opf_name)
def create_epub_cover(container, cover_path):
    '''
    Add the image at cover_path to the book, together with a titlepage
    document that displays it. The titlepage is inserted at index 0 of the
    spine and referenced from the OPF <guide> with type "cover". The image
    is referenced from a <meta name="cover"> element in the OPF <metadata>.

    Returns the tuple (raster_cover, titlepage) of the names of the two new
    files.
    '''
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager

    ext = cover_path.rpartition('.')[-1].lower()
    raster_cover_item = container.generate_item('cover.'+ext, id_prefix='cover')
    raster_cover = container.href_to_name(raster_cover_item.get('href'),
            container.opf_name)
    with open(cover_path, 'rb') as src, container.open(raster_cover, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    # Use the user's EPUB Output settings for the style of the titlepage
    opts = load_defaults('epub_output')
    keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
    no_svg = opts.get('no_svg_cover', False)
    if no_svg:
        # The % is doubled because the template is %-formatted below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
    else:
        width, height = 600, 800
        ar = 'xMidYMid meet' if keep_aspect else 'none'
        templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
        templ = templ.replace('__viewbox__', '0 0 %d %d'%(width, height))
        templ = templ.replace('__width__', str(width))
        templ = templ.replace('__height__', str(height))

    titlepage_item = container.generate_item('titlepage.xhtml',
            id_prefix='titlepage')
    titlepage = container.href_to_name(titlepage_item.get('href'),
            container.opf_name)
    # The link to the image lives inside the titlepage, so it has to be
    # relative to the titlepage. Note that .encode() binds tighter than %,
    # it is applied to the href only.
    raw = templ%container.name_to_href(raster_cover, titlepage).encode('utf-8')
    with container.open(titlepage, 'wb') as f:
        f.write(raw)

    spine = container.opf_xpath('//opf:spine')[0]
    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
    container.insert_into_xml(spine, ref, index=0)

    guide = container.opf_get_or_create('guide')
    # The guide lives inside the OPF, so its hrefs have to be relative to the
    # OPF file, which need not be at the root of the container.
    container.insert_into_xml(guide, guide.makeelement(
        OPF('reference'), type='cover', title=_('Cover'),
        href=container.name_to_href(titlepage, container.opf_name)))

    metadata = container.opf_get_or_create('metadata')
    meta = metadata.makeelement(OPF('meta'), name='cover')
    meta.set('content', raster_cover_item.get('id'))
    container.insert_into_xml(metadata, meta)

    return raster_cover, titlepage
def set_epub_cover(container, cover_path, report):
    '''
    Replace the cover of the EPUB in container with the image at
    cover_path.

    All references to covers are removed from the OPF. An existing cover
    page is removed only if it is a simple wrapper around a single image,
    as is a second spine item that merely wraps the same image. A new cover
    image and titlepage are then inserted and links to the removed files
    are redirected to the new ones.

    report is called with a single human readable message describing what
    was done.
    '''
    cover_image = find_cover_image(container)
    cover_page = find_cover_page(container)
    wrapped_image = extra_cover_page = None
    updated = False

    # clean_opf() is a generator, it has to be consumed for the OPF to
    # actually be cleaned.
    # TODO: Handle possible_removals and also iterate over links in the removed
    # pages and handle possibly removing stylesheets referred to by them.
    possible_removals = set(clean_opf(container))  # noqa (not used yet)

    spine_items = tuple(container.spine_items)
    # The spine can be empty in a malformed book, do not index into it then
    if cover_page is None and spine_items:
        # Check if the first item in the spine is a simple cover wrapper
        candidate = container.abspath_to_name(spine_items[0])
        if find_cover_image_in_page(container, candidate) is not None:
            cover_page = candidate

    if cover_page is not None:
        wrapped_image = find_cover_image_in_page(container, cover_page)

        if len(spine_items) > 1:
            # Look for an extra cover page
            c = container.abspath_to_name(spine_items[1])
            if c != cover_page:
                candidate = find_cover_image_in_page(container, c)
                if candidate and candidate in {wrapped_image, cover_image}:
                    # This page has only a single image and that image is the
                    # cover image, remove it.
                    container.remove_item(c)
                    extra_cover_page = c

        if wrapped_image is not None:
            # The cover page is a simple wrapper around a single cover image,
            # we can remove it safely.
            container.remove_item(cover_page)
            container.remove_item(wrapped_image)
            updated = True

    if cover_image and cover_image != wrapped_image:
        # Remove the old cover image
        container.remove_item(cover_image)

    # Insert the new cover
    raster_cover, titlepage = create_epub_cover(container, cover_path)

    report('Cover updated' if updated else 'Cover inserted')

    # Replace links to the old cover image/cover page
    link_sub = {}
    for old, new in (
            (cover_page, titlepage), (wrapped_image, raster_cover),
            (cover_image, raster_cover), (extra_cover_page, titlepage)):
        if old is not None:
            link_sub[old] = new
    if link_sub:
        # Fragments are dropped, they have no meaning in the new files
        replace_links(container, link_sub, frag_map=lambda x, y:None)

View File

@ -118,9 +118,9 @@ def option_parser():
a = parser.add_option
o = partial(a, default=False, action='store_true')
o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
a('--cover', help=_(
a('--cover', '-c', help=_(
'Path to a cover image. Changes the cover specified in the ebook. '
'If no cover is present, inserts a new cover.'))
'If no cover is present, or the cover is not properly identified, inserts a new cover.'))
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
return parser

View File

@ -0,0 +1,61 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from urlparse import urlparse
from cssutils import replaceUrls
from calibre import guess_type
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
class LinkReplacer(object):
    '''
    Callable that maps a URL found in the file named base to its
    replacement, for use as a link rewriting callback.

    link_map maps old names to new names. frag_map is called as
    frag_map(old_name, fragment) and returns the fragment to use in the
    rewritten link (a false value drops the fragment). self.replaced is set
    to True once any call returns a URL different from its input.
    '''

    def __init__(self, base, container, link_map, frag_map):
        self.container, self.base = container, base
        self.link_map, self.frag_map = link_map, frag_map
        self.replaced = False

    def __call__(self, url):
        old_name = self.container.href_to_name(url, self.base)
        new_name = self.link_map.get(old_name, None) if old_name else None
        if not new_name:
            # Not a link to a file in the container or not a link that
            # needs changing
            return url
        fragment = urlparse(url).fragment
        new_href = self.container.name_to_href(new_name, self.base)
        if fragment:
            new_fragment = self.frag_map(old_name, fragment)
            if new_fragment:
                new_href = '%s#%s'%(new_href, new_fragment)
        if new_href != url:
            self.replaced = True
        return new_href
def replace_links(container, link_map, frag_map=lambda name, frag:frag):
    '''
    Rewrite the links in every document, stylesheet and NCX file of the
    container according to link_map, which maps old names to new names.
    frag_map(name, fragment) returns the fragment to use in a rewritten
    link, by default fragments are kept unchanged. Files in which at least
    one link changed are marked dirty.
    '''
    ncx_type = guess_type('toc.ncx')[0]
    for name, media_type in container.mime_map.iteritems():
        # One replacer per file, links are resolved relative to that file
        repl = LinkReplacer(name, container, link_map, frag_map)
        mt = media_type.lower()
        if mt in OEB_DOCS:
            rewrite_links(container.parsed(name), repl)
        elif mt in OEB_STYLES:
            replaceUrls(container.parsed(name), repl)
        elif mt == ncx_type:
            for elem in container.parsed(name).xpath('//*[@src]'):
                old_src = elem.get('src')
                new_src = repl(old_src)
                if new_src != old_src:
                    elem.set('src', new_src)

        if repl.replaced:
            container.dirty(name)