def href_to_name(self, href, base=None):
    '''
    Convert an href (relative to base) to a name.

    :param href: The link as found in a document or in the OPF. Only its
        path component is used, any fragment is ignored. The path is URL
        unquoted before being resolved.
    :param base: The name of the file the link was found in, or None, in
        which case self.root is used.
    :return: The result of abspath_to_name() for the resolved path, or None
        if href is empty/None, has a URL scheme (an external link), has no
        path component (for example a bare ``#fragment``) or has an
        absolute path.
    '''
    if not href:
        # Nothing to resolve. Bail out before handing None to urlparse(),
        # which is not guaranteed to accept it.
        return None
    if base is None:
        base = self.root
    else:
        # Links are relative to the directory containing the base file
        base = os.path.dirname(self.name_to_abspath(base))
    purl = urlparse(href)
    if purl.scheme or not purl.path or purl.path.startswith('/'):
        # Not a relative link to a file, so it has no name
        return None
    href = urlunquote(purl.path)
    fullpath = os.path.join(base, *href.split('/'))
    return self.abspath_to_name(fullpath)
def opf_get_or_create(self, name):
    '''
    Return the first element matching ``//opf:<name>`` in the OPF.

    If no such element exists, the OPF is marked dirty, a new empty element
    is appended to the first <package> element and that new element is
    returned.
    '''
    matches = self.opf_xpath('//opf:' + name)
    if not matches:
        self.dirty(self.opf_name)
        root = self.opf_xpath('//opf:package')[0]
        created = root.makeelement(OPF(name))
        # Keep the serialized OPF readable by ending the new element's line
        created.tail = '\n'
        root.append(created)
        return created
    return matches[0]
def is_raster_image(media_type):
    '''
    Return True if media_type is one of the raster image types recognized
    here (PNG, JPEG, including the non-standard ``image/jpg``, and GIF).

    The comparison is case insensitive. A missing media type (None or the
    empty string) gives False, so the result is always a bool rather than
    the falsy input value.
    '''
    return bool(media_type) and media_type.lower() in {
        'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}
def find_cover_image(container):
    '''
    Find a raster image marked as a cover in the OPF.

    The following are tried, in order:

      1. The manifest item pointed to by a <meta name="cover"> element
      2. A guide reference whose type is ``cover``
      3. The largest file (by size on disk) among the guide references
         whose type is in COVER_TYPES

    In every case only items whose media type is a raster image count.

    :return: The name of the cover image, or None if there is none.
    '''
    manifest_id_map = container.manifest_id_map
    mm = container.mime_map
    for meta in container.opf_xpath('//opf:meta[@name="cover" and @content]'):
        name = manifest_id_map.get(meta.get('content'), None)
        if is_raster_image(mm.get(name, None)):
            return name

    # Both guide passes below only care about references to raster images,
    # so filter and normalize the reference types once
    raster_refs = [
        (ref_type.lower(), name)
        for ref_type, name in container.guide_type_map.iteritems()
        if is_raster_image(mm.get(name, None))]

    # First look for a guide item with type == 'cover'
    for ref_type, name in raster_refs:
        if ref_type == 'cover':
            return name

    # Find the largest image from all possible guide cover items
    largest_cover = (None, 0)
    for ref_type, name in raster_refs:
        if ref_type not in COVER_TYPES:
            continue
        path = container.name_path_map.get(name, None)
        if not path:
            continue
        try:
            sz = os.path.getsize(path)
        except OSError:
            # The file is referenced but not actually present/readable, it
            # cannot be the cover. Keep looking instead of aborting.
            continue
        if sz > largest_cover[1]:
            largest_cover = (name, sz)

    return largest_cover[0] or None
def create_epub_cover(container, cover_path):
    '''
    Add the image at cover_path to the book and create a titlepage for it.

    The image is copied into the container as a new manifest item. A
    titlepage is generated from the CoverManager templates (SVG based,
    unless the ``no_svg_cover`` EPUB output default is set), inserted as the
    first item in the spine and registered as the ``cover`` guide reference.
    A <meta name="cover"> element pointing at the image is added as well.

    :param container: The container of the book being modified
    :param cover_path: Path to the new cover image file
    :return: The tuple (raster_cover, titlepage) of the names of the newly
        created image and titlepage.
    '''
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager

    # splitext() gives an empty extension for paths without one, whereas
    # rpartition('.') would hand back the entire path.
    # NOTE(review): falling back to jpg is an assumption about the most
    # likely image type, confirm it is acceptable.
    ext = os.path.splitext(cover_path)[1][1:].lower() or 'jpg'
    raster_cover_item = container.generate_item('cover.'+ext, id_prefix='cover')
    raster_cover = container.href_to_name(raster_cover_item.get('href'),
                                          container.opf_name)
    with open(cover_path, 'rb') as src, container.open(raster_cover, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    opts = load_defaults('epub_output')
    keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
    no_svg = opts.get('no_svg_cover', False)
    if no_svg:
        # The doubled % becomes a single literal % when the template is
        # %-formatted below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
    else:
        width, height = 600, 800
        ar = 'xMidYMid meet' if keep_aspect else 'none'
        templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
        templ = templ.replace('__viewbox__', '0 0 %d %d'%(width, height))
        templ = templ.replace('__width__', str(width))
        templ = templ.replace('__height__', str(height))

    titlepage_item = container.generate_item('titlepage.xhtml',
                                             id_prefix='titlepage')
    titlepage = container.href_to_name(titlepage_item.get('href'),
                                       container.opf_name)
    # NOTE(review): assumes name_to_href(name, base) returns an href relative
    # to the file named by base, which is how its other callers use it. The
    # image link lives in the titlepage, so it must be relative to it.
    cover_href = container.name_to_href(raster_cover, titlepage)
    raw = templ % cover_href.encode('utf-8')
    with container.open(titlepage, 'wb') as f:
        f.write(raw)

    spine = container.opf_xpath('//opf:spine')[0]
    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
    container.insert_into_xml(spine, ref, index=0)
    guide = container.opf_get_or_create('guide')
    # The guide lives in the OPF, so its href must be relative to the OPF
    container.insert_into_xml(guide, guide.makeelement(
        OPF('reference'), type='cover', title=_('Cover'),
        href=container.name_to_href(titlepage, container.opf_name)))
    metadata = container.opf_get_or_create('metadata')
    meta = metadata.makeelement(OPF('meta'), name='cover')
    meta.set('content', raster_cover_item.get('id'))
    container.insert_into_xml(metadata, meta)

    return raster_cover, titlepage

def set_epub_cover(container, cover_path, report):
    '''
    Replace the cover of an EPUB with the image at cover_path, inserting a
    new cover if no existing one is identified.

    All cover references are removed from the OPF. An existing cover page is
    removed only if it is a simple wrapper around a single image, and the
    same goes for a second such page directly after it. The new cover is
    then created and links to the removed files are redirected to it.

    :param container: The container of the book being modified
    :param cover_path: Path to the new cover image file
    :param report: Callable that is passed a one line status message
    '''
    cover_image = find_cover_image(container)
    cover_page = find_cover_page(container)
    wrapped_image = extra_cover_page = None
    updated = False

    # clean_opf() is a generator, it has to be consumed for the references
    # to actually be removed from the OPF
    possible_removals = set(clean_opf(container))
    possible_removals  # referenced only so the name is not reported as unused
    # TODO: Handle possible_removals and also iterate over links in the removed
    # pages and handle possibly removing stylesheets referred to by them.

    spine_items = tuple(container.spine_items)
    if cover_page is None and spine_items:
        # Check if the first item in the spine is a simple cover wrapper
        # (a book with an empty spine has nothing to check)
        candidate = container.abspath_to_name(spine_items[0])
        if find_cover_image_in_page(container, candidate) is not None:
            cover_page = candidate

    if cover_page is not None:
        wrapped_image = find_cover_image_in_page(container, cover_page)

        if len(spine_items) > 1:
            # Look for an extra cover page
            c = container.abspath_to_name(spine_items[1])
            if c != cover_page:
                candidate = find_cover_image_in_page(container, c)
                if candidate and candidate in {wrapped_image, cover_image}:
                    # This page has only a single image and that image is the
                    # cover image, remove it.
                    container.remove_item(c)
                    extra_cover_page = c

        if wrapped_image is not None:
            # The cover page is a simple wrapper around a single cover image,
            # we can remove it safely.
            container.remove_item(cover_page)
            container.remove_item(wrapped_image)
            updated = True

    if cover_image and cover_image != wrapped_image:
        # Remove the old cover image
        container.remove_item(cover_image)

    # Insert the new cover
    raster_cover, titlepage = create_epub_cover(container, cover_path)

    report('Cover updated' if updated else 'Cover inserted')

    # Replace links to the old cover image/cover page. Entries for things
    # that were not found have None as their key and are dropped.
    link_sub = {s:d for s, d in {
        cover_page:titlepage, wrapped_image:raster_cover,
        cover_image:raster_cover, extra_cover_page:titlepage}.iteritems()
        if s is not None}
    if link_sub:
        # Fragments in the old pages have no equivalent in the new ones
        replace_links(container, link_sub, frag_map=lambda x, y:None)
class LinkReplacer(object):
    '''
    Callable that maps a URL found in the file named base to its
    replacement, suitable for passing to link rewriting functions.

    :param base: Name of the file whose links are being rewritten
    :param container: Container used to convert between hrefs and names
    :param link_map: Mapping of old name to new name
    :param frag_map: Callable ``(old_name, fragment)`` returning the fragment
        to use in the rewritten link, or a false value to drop the fragment

    After use, the ``replaced`` attribute is True if at least one URL was
    actually changed.
    '''

    def __init__(self, base, container, link_map, frag_map):
        self.base = base
        self.frag_map = frag_map
        self.link_map = link_map
        self.container = container
        self.replaced = False

    def __call__(self, url):
        '''
        Return the replacement for url, or url itself if it does not
        resolve to a name present in link_map.
        '''
        name = self.container.href_to_name(url, self.base)
        if not name:
            # Not a link to a file in the book
            return url
        nname = self.link_map.get(name, None)
        if not nname:
            return url
        purl = urlparse(url)
        href = self.container.name_to_href(nname, self.base)
        if purl.fragment:
            nfrag = self.frag_map(name, purl.fragment)
            if nfrag:
                href += '#%s'%nfrag
        if href != url:
            self.replaced = True
        return href

def replace_links(container, link_map, frag_map=lambda name, frag:frag):
    '''
    Rewrite links in all documents, stylesheets and NCX files of the
    container, marking every file that was changed as dirty.

    :param container: The container whose files are to be processed
    :param link_map: Mapping of old name to new name
    :param frag_map: Callable ``(old_name, fragment)`` returning the new
        fragment, or a false value to drop it. By default fragments are
        preserved unchanged.
    '''
    ncx_type = guess_type('toc.ncx')[0]
    for name, media_type in container.mime_map.iteritems():
        repl = LinkReplacer(name, container, link_map, frag_map)
        # The media type may be missing (None) for files whose type could
        # not be guessed, such files have no links we know how to rewrite
        media_type = (media_type or '').lower()
        if media_type in OEB_DOCS:
            rewrite_links(container.parsed(name), repl)
        elif media_type in OEB_STYLES:
            replaceUrls(container.parsed(name), repl)
        elif media_type == ncx_type:
            for elem in container.parsed(name).xpath('//*[@src]'):
                src = elem.get('src')
                nsrc = repl(src)
                if src != nsrc:
                    elem.set('src', nsrc)

        if repl.replaced:
            container.dirty(name)