ebook-polish: Update covers in epub

This commit is contained in:
Kovid Goyal 2013-02-12 10:27:47 +05:30
parent 9a0164059a
commit c91c1aeba2
4 changed files with 314 additions and 19 deletions

View File

@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import os, logging, sys, hashlib, uuid import os, logging, sys, hashlib, uuid
from urllib import unquote as urlunquote, quote as urlquote from urllib import unquote as urlunquote, quote as urlquote
from urlparse import urlparse
from lxml import etree from lxml import etree
@ -96,16 +97,22 @@ class Container(object):
def name_to_abspath(self, name): def name_to_abspath(self, name):
return os.path.abspath(join(self.root, *name.split('/'))) return os.path.abspath(join(self.root, *name.split('/')))
def exists(self, name):
    ''' Return True if the file that the given name maps to is present on disk. '''
    abspath = self.name_to_abspath(name)
    return os.path.exists(abspath)
def href_to_name(self, href, base=None): def href_to_name(self, href, base=None):
''' '''
Convert an href (relative to base) to a name. base must be a name or Convert an href (relative to base) to a name. base must be a name or
None, in which self.root is used. None, in which case self.root is used.
''' '''
if base is None: if base is None:
base = self.root base = self.root
else: else:
base = os.path.dirname(self.name_to_abspath(base)) base = os.path.dirname(self.name_to_abspath(base))
href = urlunquote(href.partition('#')[0]) purl = urlparse(href)
if purl.scheme or not purl.path or purl.path.startswith('/'):
return None
href = urlunquote(purl.path)
fullpath = os.path.join(base, *href.split('/')) fullpath = os.path.join(base, *href.split('/'))
return self.abspath_to_name(fullpath) return self.abspath_to_name(fullpath)
@ -208,10 +215,19 @@ class Container(object):
return self.parsed(self.opf_name) return self.parsed(self.opf_name)
@property @property
def spine_items(self): def manifest_id_map(self):
manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name) return {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')} for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}
@property
def guide_type_map(self):
    '''
    Mapping of guide reference type to the name its href resolves to
    (relative to the OPF). Only references carrying both an href and a type
    are considered; a later reference of the same type replaces an earlier one.
    '''
    type_map = {}
    for ref in self.opf_xpath('//opf:guide/opf:reference[@href and @type]'):
        type_map[ref.get('type', '')] = self.href_to_name(
            ref.get('href'), self.opf_name)
    return type_map
@property
def spine_items(self):
manifest_id_map = self.manifest_id_map
linear, non_linear = [], [] linear, non_linear = [], []
for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'): for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
idref = item.get('idref') idref = item.get('idref')
@ -251,8 +267,8 @@ class Container(object):
self.remove_from_xml(item) self.remove_from_xml(item)
self.dirty(self.opf_name) self.dirty(self.opf_name)
path = self.name_path_map.pop(name) path = self.name_path_map.pop(name, None)
if os.path.exists(path): if path and os.path.exists(path):
os.remove(path) os.remove(path)
self.mime_map.pop(name, None) self.mime_map.pop(name, None)
self.parsed_cache.pop(name, None) self.parsed_cache.pop(name, None)
@ -301,15 +317,24 @@ class Container(object):
if idx == len(parent)-1: if idx == len(parent)-1:
parent[idx-1].tail = parent.text parent[idx-1].tail = parent.text
def opf_get_or_create(self, name):
    '''
    Return the first OPF element with the given (unprefixed) tag name. If no
    such element exists, mark the OPF dirty, append a new empty element to
    the package element and return that instead.
    '''
    matches = self.opf_xpath('//opf:'+name)
    if not matches:
        self.dirty(self.opf_name)
        package = self.opf_xpath('//opf:package')[0]
        created = package.makeelement(OPF(name))
        created.tail = '\n'
        package.append(created)
        return created
    return matches[0]
def generate_item(self, name, id_prefix=None, media_type=None): def generate_item(self, name, id_prefix=None, media_type=None):
'''Add an item to the manifest with href derived from the given '''Add an item to the manifest with href derived from the given
name. Ensures uniqueness of href and id automatically. Returns name. Ensures uniqueness of href and id automatically. Returns
generated item.''' generated item.'''
id_prefix = id_prefix or 'id' id_prefix = id_prefix or 'id'
media_type = media_type or guess_type(name)[0] media_type = media_type or guess_type(name)[0]
path = self.name_to_abspath(name) href = self.name_to_href(name, self.opf_name)
relpath = self.relpath(path, base=self.opf_dir)
href = urlquote(relpath)
base, ext = href.rpartition('.')[0::2] base, ext = href.rpartition('.')[0::2]
all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')} all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
c = 0 c = 0
@ -319,8 +344,12 @@ class Container(object):
item_id = id_prefix + '%d'%c item_id = id_prefix + '%d'%c
all_names = {x.get('href') for x in self.opf_xpath( all_names = {x.get('href') for x in self.opf_xpath(
'//opf:manifest/opf:item[@href]')} '//opf:manifest/opf:item[@href]')}
def exists(h):
return self.exists(self.href_to_name(h, self.opf_name))
c = 0 c = 0
while href in all_names: while href in all_names or exists(href):
c += 1 c += 1
href = '%s_%d.%s'%(base, c, ext) href = '%s_%d.%s'%(base, c, ext)
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
@ -329,16 +358,27 @@ class Container(object):
item.set('media-type', media_type) item.set('media-type', media_type)
self.insert_into_xml(manifest, item) self.insert_into_xml(manifest, item)
self.dirty(self.opf_name) self.dirty(self.opf_name)
name = self.href_to_name(href, self.opf_name)
self.name_path_map[name] = self.name_to_abspath(name)
self.mime_map[name] = media_type
return item return item
def commit(self, outpath=None): def commit_item(self, name):
for name in tuple(self.dirtied):
self.dirtied.remove(name) self.dirtied.remove(name)
data = self.parsed_cache.pop(name) data = self.parsed_cache.pop(name)
data = serialize(data, self.mime_map[name]) data = serialize(data, self.mime_map[name])
with open(self.name_path_map[name], 'wb') as f: with open(self.name_path_map[name], 'wb') as f:
f.write(data) f.write(data)
def open(self, name, mode='rb'):
    '''
    Open the file backing name and return the file object. If the item has
    uncommitted changes they are committed first.
    '''
    has_pending_changes = name in self.dirtied
    if has_pending_changes:
        self.commit_item(name)
    path = self.name_to_abspath(name)
    return open(path, mode)
def commit(self, outpath=None):
    '''
    Commit every item currently marked dirty. outpath is accepted but not
    used by this code.
    '''
    # Snapshot first: commit_item() removes entries from self.dirtied
    pending = tuple(self.dirtied)
    for dirty_name in pending:
        self.commit_item(dirty_name)
def compare_to(self, other): def compare_to(self, other):
if set(self.name_path_map) != set(other.name_path_map): if set(self.name_path_map) != set(other.name_path_map):
return 'Set of files is not the same' return 'Set of files is not the same'

View File

@ -7,9 +7,10 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import shutil import shutil, re, os
from calibre.ebooks.oeb.base import OPF from calibre.ebooks.oeb.base import OPF, OEB_DOCS, XPath, XLINK, xml2text
from calibre.ebooks.oeb.polish.replace import replace_links
def set_azw3_cover(container, cover_path, report): def set_azw3_cover(container, cover_path, report):
name = None name = None
@ -33,4 +34,197 @@ def set_azw3_cover(container, cover_path, report):
def set_cover(container, cover_path, report): def set_cover(container, cover_path, report):
if container.book_type == 'azw3': if container.book_type == 'azw3':
set_azw3_cover(container, cover_path, report) set_azw3_cover(container, cover_path, report)
else:
set_epub_cover(container, cover_path, report)
###############################################################################
# The delightful EPUB cover processing
def is_raster_image(media_type):
    '''
    Return True if media_type (compared case-insensitively) is one of the
    PNG, JPEG or GIF image types, False otherwise. A missing media type
    (None or the empty string) yields False rather than the falsy input.
    '''
    return bool(media_type) and media_type.lower() in {
        'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}
# OPF guide reference types that are treated as cover related. Callers in this
# file lower-case the reference type before testing membership.
COVER_TYPES = { 'coverimagestandard', 'other.ms-coverimage-standard',
        'other.ms-titleimage-standard', 'other.ms-titleimage',
        'other.ms-coverimage', 'other.ms-thumbimage-standard',
        'other.ms-thumbimage', 'thumbimagestandard', 'cover'}
def find_cover_image(container):
    '''
    Find a raster image marked as a cover in the OPF.

    Candidates are tried in this order: the manifest item named by a
    <meta name="cover"> element, a guide reference of type "cover", and
    finally the largest file (by size on disk) among all guide references
    whose type is in COVER_TYPES. Returns the name of the image, or None if
    nothing suitable is found.
    '''
    id_to_name = container.manifest_id_map
    mime_types = container.mime_map

    def is_raster(name):
        return is_raster_image(mime_types.get(name, None))

    # An explicit <meta name="cover" content="item-id"/> wins
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        candidate = id_to_name.get(meta.get('content'), None)
        if is_raster(candidate):
            return candidate

    # Next, a guide item with type == 'cover'
    guide = container.guide_type_map
    for ref_type, name in guide.iteritems():
        if ref_type.lower() == 'cover' and is_raster(name):
            return name

    # Finally, the largest image from all possible guide cover items
    best_name, best_size = None, 0
    for ref_type, name in guide.iteritems():
        if ref_type.lower() not in COVER_TYPES or not is_raster(name):
            continue
        path = container.name_path_map.get(name, None)
        if not path:
            continue
        size = os.path.getsize(path)
        if size > best_size:
            best_name, best_size = name, size
    if best_name:
        return best_name
def find_cover_page(container):
    '''
    Find a document marked as a cover in the OPF.

    Returns the name of the first guide reference of type "cover" (compared
    case-insensitively) whose media type is in OEB_DOCS, or None if there is
    no such reference.
    '''
    mm = container.mime_map
    guide_type_map = container.guide_type_map
    for ref_type, name in guide_type_map.iteritems():
        # The stored media type can be None (an item added with a type that
        # could not be guessed), so do not call .lower() on it blindly.
        media_type = (mm.get(name, None) or '').lower()
        if ref_type.lower() == 'cover' and media_type in OEB_DOCS:
            return name
def find_cover_image_in_page(container, cover_page):
    '''
    If the document cover_page is nothing more than a wrapper around a single
    image, return the name of that image (as resolved by
    container.href_to_name relative to cover_page). Returns None if the
    document does not have exactly one <body>, contains any non-whitespace
    text, references more than one image, or references none.
    '''
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return
    body = bodies[0]

    find_images = XPath(
        'descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')
    hrefs = (img.get('src') or img.get(XLINK('href'))
             for img in find_images(body))
    images = [container.href_to_name(href, base=cover_page)
              for href in hrefs if href]

    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(images) > 1:
        # Document has more content than a single image
        return
    if images:
        return images[0]
def clean_opf(container):
    '''
    Remove all references to covers from the OPF.

    This is a generator: it removes every <meta name="cover"> element and
    every guide reference whose type is in COVER_TYPES, yielding the name each
    removed element pointed at when that name is present in
    container.name_path_map. The OPF is marked dirty only once the generator
    has been run to completion.
    '''
    id_to_name = container.manifest_id_map
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        target = id_to_name.get(meta.get('content', None), None)
        container.remove_from_xml(meta)
        if target and target in container.name_path_map:
            yield target

    guide = container.guide_type_map
    for ref in container.opf_xpath('//opf:guide/opf:reference[@type]'):
        ref_type = ref.get('type', '')
        if ref_type.lower() not in COVER_TYPES:
            continue
        container.remove_from_xml(ref)
        target = guide.get(ref_type, None)
        if target and target in container.name_path_map:
            yield target

    container.dirty(container.opf_name)
def create_epub_cover(container, cover_path):
    '''
    Add the image at cover_path to the book as its cover.

    Copies the image into the container as a new manifest item, generates a
    titlepage document wrapping it (SVG based unless the saved epub_output
    defaults set no_svg_cover), puts the titlepage first in the spine, and
    adds a guide reference of type "cover" plus a <meta name="cover"> element
    to the OPF.

    Returns the tuple (raster_cover, titlepage) of the names of the two newly
    created items.
    '''
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager

    ext = cover_path.rpartition('.')[-1].lower()
    raster_cover_item = container.generate_item('cover.'+ext, id_prefix='cover')
    raster_cover = container.href_to_name(raster_cover_item.get('href'),
            container.opf_name)
    with open(cover_path, 'rb') as src, container.open(raster_cover, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    opts = load_defaults('epub_output')
    keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
    no_svg = opts.get('no_svg_cover', False)
    if no_svg:
        # The percent sign is doubled because the template is run through
        # the % operator below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
    else:
        width, height = 600, 800
        ar = 'xMidYMid meet' if keep_aspect else 'none'
        templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
        templ = templ.replace('__viewbox__', '0 0 %d %d'%(width, height))
        templ = templ.replace('__width__', str(width))
        templ = templ.replace('__height__', str(height))

    titlepage_item = container.generate_item('titlepage.xhtml',
                                             id_prefix='titlepage')
    titlepage = container.href_to_name(titlepage_item.get('href'),
                                       container.opf_name)
    # The link to the image lives inside the titlepage, so it has to be
    # relative to the titlepage. Encode the whole formatted document, not
    # just the href: without the parentheses .encode() binds tighter than %.
    raw = (templ % container.name_to_href(raster_cover, titlepage)).encode('utf-8')
    with container.open(titlepage, 'wb') as f:
        f.write(raw)

    spine = container.opf_xpath('//opf:spine')[0]
    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
    container.insert_into_xml(spine, ref, index=0)

    guide = container.opf_get_or_create('guide')
    # The guide lives in the OPF, so its href has to be relative to the OPF
    container.insert_into_xml(guide, guide.makeelement(
        OPF('reference'), type='cover', title=_('Cover'),
        href=container.name_to_href(titlepage, container.opf_name)))

    metadata = container.opf_get_or_create('metadata')
    meta = metadata.makeelement(OPF('meta'), name='cover')
    meta.set('content', raster_cover_item.get('id'))
    container.insert_into_xml(metadata, meta)

    return raster_cover, titlepage
def set_epub_cover(container, cover_path, report):
    '''
    Replace the cover of an EPUB with the image at cover_path.

    Existing cover references are stripped from the OPF; a cover page that
    merely wraps a single image is removed together with that image, as is a
    second spine item that only shows the same cover image. A new cover image
    and titlepage are then created and links to the removed items are
    redirected to them. report is called with a one-line status message.
    '''
    cover_image = find_cover_image(container)
    cover_page = find_cover_page(container)
    wrapped_image = extra_cover_page = None
    updated = False

    # clean_opf() is a generator, it has to be consumed for the OPF to
    # actually be cleaned
    possible_removals = set(clean_opf(container))
    possible_removals
    # TODO: Handle possible_removals and also iterate over links in the removed
    # pages and handle possibly removing stylesheets referred to by them.

    spine_items = tuple(container.spine_items)
    if cover_page is None and spine_items:
        # Check if the first item in the spine is a simple cover wrapper.
        # Books with an empty spine are skipped here instead of raising an
        # IndexError.
        candidate = container.abspath_to_name(spine_items[0])
        if find_cover_image_in_page(container, candidate) is not None:
            cover_page = candidate

    if cover_page is not None:
        wrapped_image = find_cover_image_in_page(container, cover_page)

    if len(spine_items) > 1:
        # Look for an extra cover page
        c = container.abspath_to_name(spine_items[1])
        if c != cover_page:
            candidate = find_cover_image_in_page(container, c)
            if candidate and candidate in {wrapped_image, cover_image}:
                # This page has only a single image and that image is the
                # cover image, remove it.
                container.remove_item(c)
                extra_cover_page = c
                spine_items = spine_items[:1] + spine_items[2:]

    if wrapped_image is not None:
        # The cover page is a simple wrapper around a single cover image,
        # we can remove it safely.
        container.remove_item(cover_page)
        container.remove_item(wrapped_image)
        updated = True

    if cover_image and cover_image != wrapped_image:
        # Remove the old cover image
        container.remove_item(cover_image)

    # Insert the new cover
    raster_cover, titlepage = create_epub_cover(container, cover_path)

    report('Cover updated' if updated else 'Cover inserted')

    # Replace links to the old cover image/cover page
    link_sub = {s:d for s, d in {
        cover_page:titlepage, wrapped_image:raster_cover,
        cover_image:raster_cover, extra_cover_page:titlepage}.iteritems()
        if s is not None}
    if link_sub:
        # Fragments are dropped, the replaced documents no longer exist
        replace_links(container, link_sub, frag_map=lambda x, y:None)

View File

@ -118,9 +118,9 @@ def option_parser():
a = parser.add_option a = parser.add_option
o = partial(a, default=False, action='store_true') o = partial(a, default=False, action='store_true')
o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset']) o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
a('--cover', help=_( a('--cover', '-c', help=_(
'Path to a cover image. Changes the cover specified in the ebook. ' 'Path to a cover image. Changes the cover specified in the ebook. '
'If no cover is present, inserts a new cover.')) 'If no cover is present, or the cover is not properly identified, inserts a new cover.'))
o('--verbose', help=_('Produce more verbose output, useful for debugging.')) o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
return parser return parser

View File

@ -0,0 +1,61 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from urlparse import urlparse
from cssutils import replaceUrls
from calibre import guess_type
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, rewrite_links)
class LinkReplacer(object):
    '''
    Callable that rewrites a single URL found in the file named base.

    link_map maps old names to new names. frag_map(name, fragment) is called
    for URLs that carry a fragment and returns the fragment to use on the
    rewritten URL (a false value drops the fragment). After use, the replaced
    attribute is True if any call returned something different from its input.
    '''

    def __init__(self, base, container, link_map, frag_map):
        self.base, self.container = base, container
        self.link_map, self.frag_map = link_map, frag_map
        self.replaced = False

    def __call__(self, url):
        old_name = self.container.href_to_name(url, self.base)
        new_name = self.link_map.get(old_name, None) if old_name else None
        if not new_name:
            # Either not a link into the book or not a link being replaced
            return url
        fragment = urlparse(url).fragment
        new_href = self.container.name_to_href(new_name, self.base)
        if fragment:
            new_fragment = self.frag_map(old_name, fragment)
            if new_fragment:
                new_href += '#%s'%new_fragment
        if new_href != url:
            self.replaced = True
        return new_href
def replace_links(container, link_map, frag_map=lambda name, frag:frag):
    '''
    Rewrite links in every document, stylesheet and NCX file of the container.

    link_map maps old names to new names. frag_map(name, fragment) returns
    the fragment to keep on a rewritten link; the default keeps it unchanged.
    Files in which at least one link was changed are marked dirty.
    '''
    ncx_type = guess_type('toc.ncx')[0]
    for name, media_type in container.mime_map.iteritems():
        # An item can have no media type (None); treat it as matching
        # nothing instead of failing on .lower()
        mt = (media_type or '').lower()
        repl = LinkReplacer(name, container, link_map, frag_map)
        if mt in OEB_DOCS:
            rewrite_links(container.parsed(name), repl)
        elif mt in OEB_STYLES:
            replaceUrls(container.parsed(name), repl)
        elif mt == ncx_type:
            for elem in container.parsed(name).xpath('//*[@src]'):
                src = elem.get('src')
                nsrc = repl(src)
                if src != nsrc:
                    elem.set('src', nsrc)

        if repl.replaced:
            container.dirty(name)