ebook-polish: Implementing cover setting in azw3 and fix various bugs in the container class

2025-07-09 03:04:10 -04:00 · 2013-02-10 10:05:02 +05:30 · 2013-02-10 10:05:02 +05:30 · 6aa1b67d88
commit 6aa1b67d88
parent 789f4ab01a
4 changed files with 191 additions and 42 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import os, logging, sys, hashlib, uuid
-from urllib import unquote as urlunquote
+from urllib import unquote as urlunquote, quote as urlquote

 from lxml import etree

@ -22,8 +22,8 @@ from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcess
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.reader.headers import MetadataHeader
 from calibre.ebooks.mobi.tweak import set_cover
-from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
-                                     OEB_STYLES, OPF2_NS)
+from calibre.ebooks.oeb.base import (
+    serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
 from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@ -34,9 +34,25 @@ from calibre.utils.zipfile import ZipFile
 exists, join, relpath = os.path.exists, os.path.join, os.path.relpath

 OEB_FONTS = {guess_type('a.ttf')[0], guess_type('b.ttf')[0]}
+OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS}

 class Container(object):

+    '''
+    A container represents an Open EBook as a directory full of files and an
+    opf file. There are two important concepts:
+
+        * The root directory. This is the base of the ebook. All the ebook's
+          files are inside this directory or in its sub-directories.
+
+        * Names: These are paths to the book's files relative to the root
+          directory. They always contain POSIX separators and are unquoted. They
+          can be thought of as canonical identifiers for files in the book.
+          Most methods on the container object work with names.
+    '''
+
+    book_type = 'oeb'
+
    def __init__(self, rootpath, opfpath, log):
        self.root = os.path.abspath(rootpath)
        self.log = log
@ -54,7 +70,7 @@ class Container(object):
        for dirpath, _dirnames, filenames in os.walk(self.root):
            for f in filenames:
                path = join(dirpath, f)
-                name = relpath(path, self.root).replace(os.sep, '/')
+                name = self.abspath_to_name(path)
                self.name_path_map[name] = path
                self.mime_map[name] = guess_type(path)[0]
                # Special case if we have stumbled onto the opf
@ -63,35 +79,52 @@ class Container(object):
                    self.opf_dir = os.path.dirname(path)
                    self.mime_map[name] = guess_type('a.opf')[0]

+        if not hasattr(self, 'opf_name'):
+            raise InvalidBook('Book has no OPF file')
+
        # Update mime map with data from the OPF
-        for item in self.opf.xpath(
-                '//opf:manifest/opf:item[@href and @media-type]',
-                namespaces={'opf':OPF2_NS}):
+        for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
            href = item.get('href')
-            self.mime_map[self.href_to_name(href)] = item.get('media-type')
+            self.mime_map[self.href_to_name(href, self.opf_name)] = item.get('media-type')

    def abspath_to_name(self, fullpath):
+        '''Convert a filesystem path to a name: the path relative to
+        self.root, with OS separators replaced by '/'.'''
        return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/')

+    def name_to_abspath(self, name):
+        '''Return the absolute filesystem path (with OS separators) of the
+        file identified by name, resolved against self.root.'''
+        components = name.split('/')
+        return os.path.abspath(join(self.root, *components))
+
    def href_to_name(self, href, base=None):
        '''
-        Convert an href (relative to base) to a name (i.e. a path
-        relative to self.root with POSIX separators).
-
-        base must be an absolute path with OS separators or None, in which case
-        the href is interpreted relative to the dir containing the OPF.
+        Convert an href (relative to base) to a name. base must be a name or
+        None, in which case self.root is used. Any fragment (the part after
+        the first '#') is dropped and the href is URL-unquoted before it is
+        resolved.
        '''
        if base is None:
-            base = self.opf_dir
+            base = self.root
+        else:
+            # hrefs are resolved against the directory containing the base file
+            base = os.path.dirname(self.name_to_abspath(base))
        href = urlunquote(href.partition('#')[0])
        fullpath = os.path.join(base, *href.split('/'))
        return self.abspath_to_name(fullpath)

+    def name_to_href(self, name, base=None):
+        '''Convert a name to a href relative to base, which must be a name or
+        None, in which case self.root is used as the base. The returned href
+        uses POSIX separators and is URL quoted.'''
+        fullpath = self.name_to_abspath(name)
+        basepath = self.root if base is None else os.path.dirname(self.name_to_abspath(base))
+        path = relpath(fullpath, basepath).replace(os.sep, '/')
+        if isinstance(path, unicode):
+            # Python 2's urllib.quote() raises KeyError for non-ASCII unicode
+            # input, so quote the UTF-8 encoded bytes instead
+            path = path.encode('utf-8')
+        return urlquote(path)
+
+    def opf_xpath(self, expr):
+        '''Evaluate the XPath expression expr against the OPF with the
+        prefixes defined in OPF_NAMESPACES (opf and dc) available.'''
+        opf_root = self.opf
+        return opf_root.xpath(expr, namespaces=OPF_NAMESPACES)
+
    def has_name(self, name):
+        'Return True if name is a key of self.name_path_map'
        return name in self.name_path_map

-    def relpath(self, path):
-        return relpath(path, self.root)
+    def relpath(self, path, base=None):
+        '''Convert an absolute path (with os separators) to a path relative to
+        base (defaults to self.root). The relative path is *not* a name. Use
+        abspath_to_name() for that.'''
+        # Any falsy base (None or an empty string) falls back to self.root
+        return relpath(path, base or self.root)

    def decode(self, data):
        """Automatically decode :param:`data` into a `unicode` object."""
@ -173,13 +206,11 @@ class Container(object):

    @property
    def spine_items(self):
-        manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'))
-            for item in self.opf.xpath('//opf:manifest/opf:item[@href and @id]',
-                namespaces={'opf':OPF2_NS})}
+        manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
+            for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}

        linear, non_linear = [], []
-        for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
-                                   namespaces={'opf':OPF2_NS}):
+        for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
            idref = item.get('idref')
            name = manifest_id_map.get(idref, None)
            path = self.name_path_map.get(name, None)
@ -198,26 +229,23 @@ class Container(object):
        any internal caches.
        '''
        removed = set()
-        for elem in self.opf.xpath('//opf:manifest/opf:item[@href]',
-                                   namespaces={'opf':OPF2_NS}):
-            if self.href_to_name(elem.get('href')) == name:
+        for elem in self.opf_xpath('//opf:manifest/opf:item[@href]'):
+            if self.href_to_name(elem.get('href'), self.opf_name) == name:
                id_ = elem.get('id', None)
                if id_ is not None:
                    removed.add(id_)
-                elem.getparent().remove(elem)
+                self.remove_from_xml(elem)
                self.dirty(self.opf_name)
        if removed:
-            for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
-                                    namespaces={'opf':OPF2_NS}):
+            for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
                idref = item.get('idref')
                if idref in removed:
-                    item.getparent().remove(item)
+                    self.remove_from_xml(item)
                    self.dirty(self.opf_name)

-        for item in self.opf.xpath('//opf:guide/opf:reference[@href]',
-                                    namespaces={'opf':OPF2_NS}):
-            if self.href_to_name(item.get('href')) == name:
-                item.getparent().remove(item)
+        for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
+            if self.href_to_name(item.get('href'), self.opf_name) == name:
+                self.remove_from_xml(item)
                self.dirty(self.opf_name)

        path = self.name_path_map.pop(name)
@ -230,6 +258,76 @@ class Container(object):
    def dirty(self, name):
+        'Record name in the self.dirtied set'
        self.dirtied.add(name)

+    def remove_from_xml(self, item):
+        '''Detach item from its parent element and return it. The
+        surrounding whitespace is adjusted so that the indentation of the
+        remaining markup is preserved (works only with self closing
+        items).'''
+        parent = item.getparent()
+        pos = parent.index(item)
+        if pos > 0:
+            # The preceding sibling inherits the whitespace that followed item
+            parent[pos-1].tail = item.tail
+        elif len(parent) == 1:
+            # item was the only child, so its tail becomes the parent's text.
+            # When other children follow the first one, nothing needs fixing.
+            parent.text = item.tail
+        parent.remove(item)
+        return item
+
+    def insert_into_xml(self, parent, item, index=None):
+        '''Add item to parent at position index (at the end when index is
+        None) and adjust the surrounding whitespace so that the indentation
+        stays consistent. Only works with self closing items.'''
+        if index is None:
+            parent.append(item)
+        else:
+            parent.insert(index, item)
+        pos = parent.index(item)
+
+        if pos > 0:
+            previous = parent[pos-1]
+            item.tail = previous.tail
+            if pos == len(parent)-1:
+                # item is now the last child: the former last child is
+                # followed by another child, so it gets the same whitespace
+                # that precedes the first child
+                previous.tail = parent.text
+            return
+
+        # item is the first child
+        item.tail = parent.text
+        if len(parent) != 1:
+            return
+
+        # item is the only child, we have gone from a self-closing <foo />
+        # element to <foo><item /></foo>, so borrow the whitespace from the
+        # sibling that precedes parent
+        sibling = parent.getprevious()
+        if sibling is None:
+            # Give up!
+            return
+        parent.text = sibling.text
+        item.tail = sibling.tail
+
+    def generate_item(self, name, id_prefix=None, media_type=None):
+        '''Add an item to the manifest with href derived from the given
+        name. Ensures uniqueness of href and id automatically. Returns
+        generated item.
+
+        :param name: Name of the file the new item should point to.
+        :param id_prefix: Prefix for the generated id, defaults to ``'id'``.
+        :param media_type: Value for the media-type attribute. Guessed from
+            name when not specified.
+        '''
+        id_prefix = id_prefix or 'id'
+        # lxml refuses None as an attribute value, so fall back to a generic
+        # type when the type cannot be guessed from the name
+        media_type = (media_type or guess_type(name)[0] or
+                      'application/octet-stream')
+        # hrefs in the OPF are relative to the OPF file and must use POSIX
+        # separators. name_to_href() takes care of both, a plain os.path
+        # relative path would contain (quoted) backslashes on windows.
+        href = self.name_to_href(name, self.opf_name)
+        base, dot, ext = href.rpartition('.')
+        if not dot or '/' in ext:
+            # The final path component has no extension
+            base, ext = href, ''
+        else:
+            ext = '.' + ext
+        all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
+        c = 0
+        item_id = id_prefix
+        while item_id in all_ids:
+            c += 1
+            item_id = id_prefix + '%d'%c
+        all_names = {x.get('href') for x in self.opf_xpath(
+                '//opf:manifest/opf:item[@href]')}
+        c = 0
+        while href in all_names:
+            c += 1
+            href = '%s_%d%s'%(base, c, ext)
+        manifest = self.opf_xpath('//opf:manifest')[0]
+        item = manifest.makeelement(OPF('item'), nsmap=OPF_NAMESPACES,
+                                    id=item_id, href=href)
+        item.set('media-type', media_type)
+        self.insert_into_xml(manifest, item)
+        self.dirty(self.opf_name)
+        return item
+
    def commit(self, outpath=None):
        for name in tuple(self.dirtied):
            self.dirtied.remove(name)
@ -257,6 +355,8 @@ OCF_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'

 class EpubContainer(Container):

+    book_type = 'epub'
+
    META_INF = {
            'container.xml' : True,
            'manifest.xml' : False,
@ -314,7 +414,7 @@ class EpubContainer(Container):
            if alg not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                raise DRMError()
            cr = em.getparent().xpath('descendant::*[local-name()="CipherReference" and @URI]')[0]
-            name = self.href_to_name(cr.get('URI'), self.root)
+            name = self.href_to_name(cr.get('URI'))
            path = self.name_path_map.get(name, None)
            if path is not None:
                fonts[name] = alg
@ -327,14 +427,14 @@ class EpubContainer(Container):
                package_id = val
                break
        if package_id is not None:
-            for elem in self.opf.xpath('//*[@id=%r]'%package_id):
+            for elem in self.opf_xpath('//*[@id=%r]'%package_id):
                if elem.text:
                    unique_identifier = elem.text.rpartition(':')[-1]
                    break
        if unique_identifier is not None:
            idpf_key = hashlib.sha1(unique_identifier).digest()
        key = None
-        for item in self.opf.xpath('//*[local-name()="metadata"]/*'
+        for item in self.opf_xpath('//*[local-name()="metadata"]/*'
                                   '[local-name()="identifier"]'):
            scheme = None
            for xkey in item.attrib.keys():
@ -397,6 +497,8 @@ def do_explode(path, dest):

 class AZW3Container(Container):

+    book_type = 'azw3'
+
    def __init__(self, pathtoazw3, log):
        self.pathtoazw3 = pathtoazw3
        tdir = self.root = PersistentTemporaryDirectory('_azw3_container')
--- a/src/calibre/ebooks/oeb/polish/cover.py
+++ b/src/calibre/ebooks/oeb/polish/cover.py
@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import shutil
+
+from calibre.ebooks.oeb.base import OPF
+
+def set_azw3_cover(container, cover_path, report):
+    '''Set the cover of an AZW3 book to the image at cover_path.
+
+    All existing cover references are removed from the guide. If the last of
+    them points to a file present in the container, that file is overwritten,
+    otherwise a new manifest item is generated for the cover. A single
+    reference of type cover is then added to the guide (the guide itself is
+    created if the OPF does not have one).
+
+    :param container: The container of the book to modify.
+    :param cover_path: Path to the new cover image.
+    :param report: Callable used to report what was done.
+    '''
+    name = None
+    for gi in container.opf_xpath('//opf:guide/opf:reference[@href and contains(@type, "cover")]'):
+        href = gi.get('href')
+        name = container.href_to_name(href, container.opf_name)
+        container.remove_from_xml(gi)
+    found = name is not None and container.has_name(name)
+    if not found:
+        item = container.generate_item(name='cover.jpeg', id_prefix='cover')
+        name = container.href_to_name(item.get('href'), container.opf_name)
+    href = container.name_to_href(name, container.opf_name)
+    guides = container.opf_xpath('//opf:guide')
+    if guides:
+        guide = guides[0]
+    else:
+        # A book without a <guide> is exactly the case where no cover
+        # reference exists, so create the element instead of failing with an
+        # IndexError
+        package = container.opf_xpath('/*')[0]
+        guide = package.makeelement(OPF('guide'))
+        container.insert_into_xml(package, guide)
+    container.insert_into_xml(guide, guide.makeelement(
+        OPF('reference'), href=href, type='cover'))
+    shutil.copyfile(cover_path, container.name_to_abspath(name))
+    container.dirty(container.opf_name)
+    report('Cover updated' if found else 'Cover inserted')
+
+def set_cover(container, cover_path, report):
+    '''Set the cover of the book in container to the image at cover_path.
+    Only containers whose book_type is azw3 are handled, for all other
+    types nothing is done.'''
+    if container.book_type != 'azw3':
+        return
+    set_azw3_cover(container, cover_path, report)
+
--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@ -14,6 +14,7 @@ from functools import partial
 from calibre.ebooks.oeb.polish.container import get_container
 from calibre.ebooks.oeb.polish.stats import StatsCollector
 from calibre.ebooks.oeb.polish.subset import subset_all_fonts
+from calibre.ebooks.oeb.polish.cover import set_cover
 from calibre.utils.logging import Log

 ALL_OPTS = {
@ -72,6 +73,7 @@ CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in HELP.iteritems()}
 # }}}

 def polish(file_map, opts, log, report):
+    rt = lambda x: report('\n### ' + x)
    for inbook, outbook in file_map.iteritems():
        report('Polishing: %s'%(inbook.rpartition('.')[-1].upper()))
        ebook = get_container(inbook, log)
@ -80,10 +82,15 @@ def polish(file_map, opts, log, report):
            stats = StatsCollector(ebook)

        if opts.subset:
-            report('\n### Subsetting embedded fonts')
+            rt('Subsetting embedded fonts')
            subset_all_fonts(ebook, stats.font_stats, report)
            report('')

+        if opts.cover:
+            rt('Setting cover')
+            set_cover(ebook, opts.cover, report)
+            report('')
+
        ebook.commit(outbook)

 def gui_polish(data):
@ -105,8 +112,12 @@ def option_parser():
    USAGE = '%prog [options] input_file [output_file]\n\n' + re.sub(
        r'<.*?>', '', CLI_HELP['about'])
    parser = OptionParser(usage=USAGE)
-    o = partial(parser.add_option, default=False, action='store_true')
+    a = parser.add_option
+    o = partial(a, default=False, action='store_true')
    o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
+    a('--cover', help=_(
+        'Path to a cover image. Changes the cover specified in the ebook. '
+        'If no cover is present, inserts a new cover.'))
    o('--verbose', help=_('Produce more verbose output, useful for debugging.'))

    return parser
@ -139,7 +150,7 @@ def main():
    report = []
    something = False
    for name in ALL_OPTS:
-        if name not in {'opf', 'cover'}:
+        if name not in {'opf', }:
            if getattr(popts, name):
                something = True

--- a/src/calibre/ebooks/oeb/polish/subset.py
+++ b/src/calibre/ebooks/oeb/polish/subset.py
@ -15,7 +15,7 @@ from calibre.ebooks.oeb.polish.container import OEB_FONTS
 from calibre.utils.fonts.sfnt.subset import subset
 from calibre.utils.fonts.utils import get_font_names

-def remove_font_face_rules(container, sheet, remove_names):
+def remove_font_face_rules(container, sheet, remove_names, base):
    changed = False
    for rule in tuple(sheet.cssRules):
        if rule.type != rule.FONT_FACE_RULE:
@ -24,7 +24,7 @@ def remove_font_face_rules(container, sheet, remove_names):
            uri = rule.style.getProperty('src').propertyValue[0].uri
        except (IndexError, KeyError, AttributeError, TypeError, ValueError):
            continue
-        name = container.href_to_name(uri)
+        name = container.href_to_name(uri, base)
        if name in remove_names:
            sheet.deleteRule(rule)
            changed = True
@ -65,13 +65,13 @@ def subset_all_fonts(container, font_stats, report):
        for name, mt in container.mime_map.iteritems():
            if mt in OEB_STYLES:
                sheet = container.parsed(name)
-                if remove_font_face_rules(container, sheet, remove):
+                if remove_font_face_rules(container, sheet, remove, name):
                    container.dirty(name)
            elif mt in OEB_DOCS:
                for style in XPath('//h:style')(container.parsed(name)):
                    if style.get('type', 'text/css') == 'text/css' and style.text:
                        sheet = container.parse_css(style.text, name)
-                        if remove_font_face_rules(container, sheet, remove):
+                        if remove_font_face_rules(container, sheet, remove, name):
                            style.text = sheet.cssText
                            container.dirty(name)
    if total_old > 0: