From 6aa1b67d8820c54dd43bf327a8c64077104d78d2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 10 Feb 2013 10:05:02 +0530
Subject: [PATCH] ebook-polish: Implement cover setting in azw3 and fix
 various bugs in the container class

---
 src/calibre/ebooks/oeb/polish/container.py | 172 ++++++++++++++++-----
 src/calibre/ebooks/oeb/polish/cover.py     |  36 +++++
 src/calibre/ebooks/oeb/polish/main.py      |  17 +-
 src/calibre/ebooks/oeb/polish/subset.py    |   8 +-
 4 files changed, 191 insertions(+), 42 deletions(-)
 create mode 100644 src/calibre/ebooks/oeb/polish/cover.py
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index d2b6ac85b5..08fae838c1 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import os, logging, sys, hashlib, uuid
-from urllib import unquote as urlunquote
+from urllib import unquote as urlunquote, quote as urlquote
 
 from lxml import etree
 
@@ -22,8 +22,8 @@ from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcess
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.reader.headers import MetadataHeader
 from calibre.ebooks.mobi.tweak import set_cover
-from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
-                                     OEB_STYLES, OPF2_NS)
+from calibre.ebooks.oeb.base import (
+    serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
 from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@@ -34,9 +34,25 @@ from calibre.utils.zipfile import ZipFile
 exists, join, relpath = os.path.exists, os.path.join, os.path.relpath
 
 OEB_FONTS = {guess_type('a.ttf')[0], guess_type('b.ttf')[0]}
+OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS}
 
 class Container(object):
 
+    '''
+    A container represents an Open EBook as a directory full of files and an
+    opf file. There are two important concepts:
+
+        * The root directory. This is the base of the ebook. All the ebook's
+          files are inside this directory or in its sub-directories.
+
+        * Names: These are paths to the book's files relative to the root
+          directory. They always contain POSIX separators and are unquoted. They
+          can be thought of as canonical identifiers for files in the book.
+          Most methods on the container object work with names.
+    '''
+
+    book_type = 'oeb'
+
     def __init__(self, rootpath, opfpath, log):
         self.root = os.path.abspath(rootpath)
         self.log = log
@@ -54,7 +70,7 @@ class Container(object):
         for dirpath, _dirnames, filenames in os.walk(self.root):
             for f in filenames:
                 path = join(dirpath, f)
-                name = relpath(path, self.root).replace(os.sep, '/')
+                name = self.abspath_to_name(path)
                 self.name_path_map[name] = path
                 self.mime_map[name] = guess_type(path)[0]
                 # Special case if we have stumbled onto the opf
@@ -63,35 +79,52 @@ class Container(object):
                     self.opf_dir = os.path.dirname(path)
                     self.mime_map[name] = guess_type('a.opf')[0]
 
+        if not hasattr(self, 'opf_name'):
+            raise InvalidBook('Book has no OPF file')
+
         # Update mime map with data from the OPF
-        for item in self.opf.xpath(
-                '//opf:manifest/opf:item[@href and @media-type]',
-                namespaces={'opf':OPF2_NS}):
+        for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
             href = item.get('href')
-            self.mime_map[self.href_to_name(href)] = item.get('media-type')
+            self.mime_map[self.href_to_name(href, self.opf_name)] = item.get('media-type')
 
     def abspath_to_name(self, fullpath):
         return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/')
 
+    def name_to_abspath(self, name):
+        return os.path.abspath(join(self.root, *name.split('/')))
+
     def href_to_name(self, href, base=None):
         '''
-        Convert an href (relative to base) to a name (i.e. a path
-        relative to self.root with POSIX separators).
-
-        base must be an absolute path with OS separators or None, in which case
-        the href is interpreted relative to the dir containing the OPF.
+        Convert an href (relative to base) to a name. base must be a name or
+        None, in which case the href is resolved relative to self.root.
         '''
         if base is None:
-            base = self.opf_dir
+            base = self.root
+        else:
+            base = os.path.dirname(self.name_to_abspath(base))
         href = urlunquote(href.partition('#')[0])
         fullpath = os.path.join(base, *href.split('/'))
         return self.abspath_to_name(fullpath)
 
+    def name_to_href(self, name, base=None):
+        '''Convert a name to a href relative to base, which must be a name or
+        None in which case self.root is used as the base'''
+        fullpath = self.name_to_abspath(name)
+        basepath = self.root if base is None else os.path.dirname(self.name_to_abspath(base))
+        path = relpath(fullpath, basepath).replace(os.sep, '/')
+        return urlquote(path)
+
+    def opf_xpath(self, expr):
+        return self.opf.xpath(expr, namespaces=OPF_NAMESPACES)
+
     def has_name(self, name):
         return name in self.name_path_map
 
-    def relpath(self, path):
-        return relpath(path, self.root)
+    def relpath(self, path, base=None):
+        '''Convert an absolute path (with os separators) to a path relative to
+        base (defaults to self.root). The relative path is *not* a name. Use
+        abspath_to_name() for that.'''
+        return relpath(path, base or self.root)
 
     def decode(self, data):
         """Automatically decode :param:`data` into a `unicode` object."""
@@ -173,13 +206,11 @@ class Container(object):
 
     @property
     def spine_items(self):
-        manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'))
-            for item in self.opf.xpath('//opf:manifest/opf:item[@href and @id]',
-                namespaces={'opf':OPF2_NS})}
+        manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
+            for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}
 
         linear, non_linear = [], []
-        for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
-                                   namespaces={'opf':OPF2_NS}):
+        for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
             idref = item.get('idref')
             name = manifest_id_map.get(idref, None)
             path = self.name_path_map.get(name, None)
@@ -198,26 +229,23 @@ class Container(object):
         any internal caches.
         '''
         removed = set()
-        for elem in self.opf.xpath('//opf:manifest/opf:item[@href]',
-                                   namespaces={'opf':OPF2_NS}):
-            if self.href_to_name(elem.get('href')) == name:
+        for elem in self.opf_xpath('//opf:manifest/opf:item[@href]'):
+            if self.href_to_name(elem.get('href'), self.opf_name) == name:
                 id_ = elem.get('id', None)
                 if id_ is not None:
                     removed.add(id_)
-                elem.getparent().remove(elem)
+                self.remove_from_xml(elem)
                 self.dirty(self.opf_name)
         if removed:
-            for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
-                                    namespaces={'opf':OPF2_NS}):
+            for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
                 idref = item.get('idref')
                 if idref in removed:
-                    item.getparent().remove(item)
+                    self.remove_from_xml(item)
                     self.dirty(self.opf_name)
 
-        for item in self.opf.xpath('//opf:guide/opf:reference[@href]',
-                                    namespaces={'opf':OPF2_NS}):
-            if self.href_to_name(item.get('href')) == name:
-                item.getparent().remove(item)
+        for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
+            if self.href_to_name(item.get('href'), self.opf_name) == name:
+                self.remove_from_xml(item)
                 self.dirty(self.opf_name)
 
         path = self.name_path_map.pop(name)
@@ -230,6 +258,76 @@ class Container(object):
     def dirty(self, name):
         self.dirtied.add(name)
 
+    def remove_from_xml(self, item):
+        'Removes item from parent, fixing indentation (works only with self closing items)'
+        parent = item.getparent()
+        idx = parent.index(item)
+        if idx == 0:
+            # We are removing the first item - only care about adjusting
+            # the tail if this was the only child
+            if len(parent) == 1:
+                parent.text = item.tail
+        else:
+            # Make sure the preceding item has this tail
+            parent[idx-1].tail = item.tail
+        parent.remove(item)
+        return item
+
+    def insert_into_xml(self, parent, item, index=None):
+        '''Insert item into parent (or append if index is None), fixing
+        indentation. Only works with self closing items.'''
+        if index is None:
+            parent.append(item)
+        else:
+            parent.insert(index, item)
+        idx = parent.index(item)
+        if idx == 0:
+            item.tail = parent.text
+            # If this is the only child of this parent element, we need a
+            # little extra work as we have gone from a self-closing <foo />
+            # element to <foo><item /></foo>
+            if len(parent) == 1:
+                sibling = parent.getprevious()
+                if sibling is None:
+                    # Give up!
+                    return
+                parent.text = sibling.text
+                item.tail = sibling.tail
+        else:
+            item.tail = parent[idx-1].tail
+            if idx == len(parent)-1:
+                parent[idx-1].tail = parent.text
+
+    def generate_item(self, name, id_prefix=None, media_type=None):
+        '''Add an item to the manifest with an href (relative to the OPF dir,
+        with POSIX separators, URL quoted) derived from name. The id and href
+        are made unique by appending a counter. Returns the generated item.'''
+        id_prefix = id_prefix or 'id'
+        media_type = media_type or guess_type(name)[0]
+        path = self.name_to_abspath(name)
+        rel = self.relpath(path, base=self.opf_dir).replace(os.sep, '/')
+        href = urlquote(rel)  # hrefs must use / as separator, even on windows
+        base, ext = href.rpartition('.')[0::2]
+        all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
+        c = 0
+        item_id = id_prefix
+        while item_id in all_ids:
+            c += 1
+            item_id = id_prefix + '%d'%c
+        all_names = {x.get('href') for x in self.opf_xpath(
+                '//opf:manifest/opf:item[@href]')}
+        c = 0
+        while href in all_names:
+            c += 1
+            href = '%s_%d.%s'%(base, c, ext)
+        manifest = self.opf_xpath('//opf:manifest')[0]
+        item = manifest.makeelement(OPF('item'), nsmap=OPF_NAMESPACES,
+                                    id=item_id, href=href)
+        item.set('media-type', media_type)
+        self.insert_into_xml(manifest, item)
+        self.dirty(self.opf_name)
+        return item
+
     def commit(self, outpath=None):
         for name in tuple(self.dirtied):
             self.dirtied.remove(name)
@@ -257,6 +355,8 @@ OCF_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'
 
 class EpubContainer(Container):
 
+    book_type = 'epub'
+
     META_INF = {
             'container.xml' : True,
             'manifest.xml' : False,
@@ -314,7 +414,7 @@ class EpubContainer(Container):
             if alg not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                 raise DRMError()
             cr = em.getparent().xpath('descendant::*[local-name()="CipherReference" and @URI]')[0]
-            name = self.href_to_name(cr.get('URI'), self.root)
+            name = self.href_to_name(cr.get('URI'))
             path = self.name_path_map.get(name, None)
             if path is not None:
                 fonts[name] = alg
@@ -327,14 +427,14 @@ class EpubContainer(Container):
                 package_id = val
                 break
         if package_id is not None:
-            for elem in self.opf.xpath('//*[@id=%r]'%package_id):
+            for elem in self.opf_xpath('//*[@id=%r]'%package_id):
                 if elem.text:
                     unique_identifier = elem.text.rpartition(':')[-1]
                     break
         if unique_identifier is not None:
             idpf_key = hashlib.sha1(unique_identifier).digest()
         key = None
-        for item in self.opf.xpath('//*[local-name()="metadata"]/*'
+        for item in self.opf_xpath('//*[local-name()="metadata"]/*'
                                    '[local-name()="identifier"]'):
             scheme = None
             for xkey in item.attrib.keys():
@@ -397,6 +497,8 @@ def do_explode(path, dest):
 
 class AZW3Container(Container):
 
+    book_type = 'azw3'
+
     def __init__(self, pathtoazw3, log):
         self.pathtoazw3 = pathtoazw3
         tdir = self.root = PersistentTemporaryDirectory('_azw3_container')
diff --git a/src/calibre/ebooks/oeb/polish/cover.py b/src/calibre/ebooks/oeb/polish/cover.py
new file mode 100644
index 0000000000..2ad0e2bdfd
--- /dev/null
+++ b/src/calibre/ebooks/oeb/polish/cover.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import shutil
+
+from calibre.ebooks.oeb.base import OPF
+
+def set_azw3_cover(container, cover_path, report):
+    name = None
+    found = True
+    for gi in container.opf_xpath('//opf:guide/opf:reference[@href and contains(@type, "cover")]'):
+        href = gi.get('href')
+        name = container.href_to_name(href, container.opf_name)
+        container.remove_from_xml(gi)
+    if name is None or not container.has_name(name):
+        item = container.generate_item(name='cover.jpeg', id_prefix='cover')
+        name = container.href_to_name(item.get('href'), container.opf_name)
+        found = False
+    href = container.name_to_href(name, container.opf_name)
+    guide = container.opf_xpath('//opf:guide')[0]
+    container.insert_into_xml(guide, guide.makeelement(
+        OPF('reference'), href=href, type='cover'))
+    shutil.copyfile(cover_path, container.name_to_abspath(name))
+    container.dirty(container.opf_name)
+    report('Cover updated' if found else 'Cover inserted')
+
+def set_cover(container, cover_path, report):
+    if container.book_type == 'azw3':
+        set_azw3_cover(container, cover_path, report)
+
diff --git a/src/calibre/ebooks/oeb/polish/main.py b/src/calibre/ebooks/oeb/polish/main.py
index 57cf570ed4..3473caacdb 100644
--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@@ -14,6 +14,7 @@ from functools import partial
 from calibre.ebooks.oeb.polish.container import get_container
 from calibre.ebooks.oeb.polish.stats import StatsCollector
 from calibre.ebooks.oeb.polish.subset import subset_all_fonts
+from calibre.ebooks.oeb.polish.cover import set_cover
 from calibre.utils.logging import Log
 
 ALL_OPTS = {
@@ -72,6 +73,7 @@ CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in HELP.iteritems()}
 # }}}
 
 def polish(file_map, opts, log, report):
+    rt = lambda x: report('\n### ' + x)
     for inbook, outbook in file_map.iteritems():
         report('Polishing: %s'%(inbook.rpartition('.')[-1].upper()))
         ebook = get_container(inbook, log)
@@ -80,10 +82,15 @@ def polish(file_map, opts, log, report):
             stats = StatsCollector(ebook)
 
         if opts.subset:
-            report('\n### Subsetting embedded fonts')
+            rt('Subsetting embedded fonts')
             subset_all_fonts(ebook, stats.font_stats, report)
             report('')
 
+        if opts.cover:
+            rt('Setting cover')
+            set_cover(ebook, opts.cover, report)
+            report('')
+
         ebook.commit(outbook)
 
 def gui_polish(data):
@@ -105,8 +112,12 @@ def option_parser():
     USAGE = '%prog [options] input_file [output_file]\n\n' + re.sub(
         r'<.*?>', '', CLI_HELP['about'])
     parser = OptionParser(usage=USAGE)
-    o = partial(parser.add_option, default=False, action='store_true')
+    a = parser.add_option
+    o = partial(a, default=False, action='store_true')
     o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
+    a('--cover', help=_(
+        'Path to a cover image. Changes the cover specified in the ebook. '
+        'If no cover is present, inserts a new cover.'))
     o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
 
     return parser
@@ -139,7 +150,7 @@ def main():
     report = []
     something = False
     for name in ALL_OPTS:
-        if name not in {'opf', 'cover'}:
+        if name not in {'opf', }:
             if getattr(popts, name):
                 something = True
 
diff --git a/src/calibre/ebooks/oeb/polish/subset.py b/src/calibre/ebooks/oeb/polish/subset.py
index eb4f4c4b1f..5ab9db7f5c 100644
--- a/src/calibre/ebooks/oeb/polish/subset.py
+++ b/src/calibre/ebooks/oeb/polish/subset.py
@@ -15,7 +15,7 @@ from calibre.ebooks.oeb.polish.container import OEB_FONTS
 from calibre.utils.fonts.sfnt.subset import subset
 from calibre.utils.fonts.utils import get_font_names
 
-def remove_font_face_rules(container, sheet, remove_names):
+def remove_font_face_rules(container, sheet, remove_names, base):
     changed = False
     for rule in tuple(sheet.cssRules):
         if rule.type != rule.FONT_FACE_RULE:
@@ -24,7 +24,7 @@ def remove_font_face_rules(container, sheet, remove_names):
             uri = rule.style.getProperty('src').propertyValue[0].uri
         except (IndexError, KeyError, AttributeError, TypeError, ValueError):
             continue
-        name = container.href_to_name(uri)
+        name = container.href_to_name(uri, base)
         if name in remove_names:
             sheet.deleteRule(rule)
             changed = True
@@ -65,13 +65,13 @@ def subset_all_fonts(container, font_stats, report):
         for name, mt in container.mime_map.iteritems():
             if mt in OEB_STYLES:
                 sheet = container.parsed(name)
-                if remove_font_face_rules(container, sheet, remove):
+                if remove_font_face_rules(container, sheet, remove, name):
                     container.dirty(name)
             elif mt in OEB_DOCS:
                 for style in XPath('//h:style')(container.parsed(name)):
                     if style.get('type', 'text/css') == 'text/css' and style.text:
                         sheet = container.parse_css(style.text, name)
-                        if remove_font_face_rules(container, sheet, remove):
+                        if remove_font_face_rules(container, sheet, remove, name):
                             style.text = sheet.cssText
                             container.dirty(name)
     if total_old > 0: