From 6aa1b67d8820c54dd43bf327a8c64077104d78d2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 10 Feb 2013 10:05:02 +0530 Subject: [PATCH] ebook-polish: Implementing cover setting in azw3 and fix various bugs in the container class --- src/calibre/ebooks/oeb/polish/container.py | 172 ++++++++++++++++----- src/calibre/ebooks/oeb/polish/cover.py | 36 +++++ src/calibre/ebooks/oeb/polish/main.py | 17 +- src/calibre/ebooks/oeb/polish/subset.py | 8 +- 4 files changed, 191 insertions(+), 42 deletions(-) create mode 100644 src/calibre/ebooks/oeb/polish/cover.py diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index d2b6ac85b5..08fae838c1 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal ' __docformat__ = 'restructuredtext en' import os, logging, sys, hashlib, uuid -from urllib import unquote as urlunquote +from urllib import unquote as urlunquote, quote as urlquote from lxml import etree @@ -22,8 +22,8 @@ from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcess from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.tweak import set_cover -from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger, - OEB_STYLES, OPF2_NS) +from calibre.ebooks.oeb.base import ( + serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF) from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile @@ -34,9 +34,25 @@ from calibre.utils.zipfile import ZipFile exists, join, relpath = os.path.exists, os.path.join, os.path.relpath OEB_FONTS = {guess_type('a.ttf')[0], guess_type('b.ttf')[0]} +OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS} class 
Container(object): + ''' + A container represents an Open EBook as a directory full of files and an + opf file. There are two important concepts: + + * The root directory. This is the base of the ebook. All the ebook's + files are inside this directory or in its sub-directories. + + * Names: These are paths to the book's files relative to the root + directory. They always contain POSIX separators and are unquoted. They + can be thought of as canonical identifiers for files in the book. + Most methods on the container object work with names. + ''' + + book_type = 'oeb' + def __init__(self, rootpath, opfpath, log): self.root = os.path.abspath(rootpath) self.log = log @@ -54,7 +70,7 @@ class Container(object): for dirpath, _dirnames, filenames in os.walk(self.root): for f in filenames: path = join(dirpath, f) - name = relpath(path, self.root).replace(os.sep, '/') + name = self.abspath_to_name(path) self.name_path_map[name] = path self.mime_map[name] = guess_type(path)[0] # Special case if we have stumbled onto the opf @@ -63,35 +79,52 @@ class Container(object): self.opf_dir = os.path.dirname(path) self.mime_map[name] = guess_type('a.opf')[0] + if not hasattr(self, 'opf_name'): + raise InvalidBook('Book has no OPF file') + # Update mime map with data from the OPF - for item in self.opf.xpath( - '//opf:manifest/opf:item[@href and @media-type]', - namespaces={'opf':OPF2_NS}): + for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'): href = item.get('href') - self.mime_map[self.href_to_name(href)] = item.get('media-type') + self.mime_map[self.href_to_name(href, self.opf_name)] = item.get('media-type') def abspath_to_name(self, fullpath): return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/') + def name_to_abspath(self, name): + return os.path.abspath(join(self.root, *name.split('/'))) + def href_to_name(self, href, base=None): ''' - Convert an href (relative to base) to a name (i.e. a path - relative to self.root with POSIX separators). 
- - base must be an absolute path with OS separators or None, in which case - the href is interpreted relative to the dir containing the OPF. + Convert an href (relative to base) to a name. base must be a name or + None, in which case self.root is used. ''' if base is None: - base = self.opf_dir + base = self.root + else: + base = os.path.dirname(self.name_to_abspath(base)) href = urlunquote(href.partition('#')[0]) fullpath = os.path.join(base, *href.split('/')) return self.abspath_to_name(fullpath) + def name_to_href(self, name, base=None): + '''Convert a name to a href relative to base, which must be a name or + None in which case self.root is used as the base''' + fullpath = self.name_to_abspath(name) + basepath = self.root if base is None else os.path.dirname(self.name_to_abspath(base)) + path = relpath(fullpath, basepath).replace(os.sep, '/') + return urlquote(path) + + def opf_xpath(self, expr): + return self.opf.xpath(expr, namespaces=OPF_NAMESPACES) + def has_name(self, name): return name in self.name_path_map - def relpath(self, path): - return relpath(path, self.root) + def relpath(self, path, base=None): + '''Convert an absolute path (with os separators) to a path relative to + base (defaults to self.root). The relative path is *not* a name. 
Use + abspath_to_name() for that.''' + return relpath(path, base or self.root) def decode(self, data): """Automatically decode :param:`data` into a `unicode` object.""" @@ -173,13 +206,11 @@ class Container(object): @property def spine_items(self): - manifest_id_map = {item.get('id'):self.href_to_name(item.get('href')) - for item in self.opf.xpath('//opf:manifest/opf:item[@href and @id]', - namespaces={'opf':OPF2_NS})} + manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name) + for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')} linear, non_linear = [], [] - for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]', - namespaces={'opf':OPF2_NS}): + for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'): idref = item.get('idref') name = manifest_id_map.get(idref, None) path = self.name_path_map.get(name, None) @@ -198,26 +229,23 @@ class Container(object): any internal caches. ''' removed = set() - for elem in self.opf.xpath('//opf:manifest/opf:item[@href]', - namespaces={'opf':OPF2_NS}): - if self.href_to_name(elem.get('href')) == name: + for elem in self.opf_xpath('//opf:manifest/opf:item[@href]'): + if self.href_to_name(elem.get('href'), self.opf_name) == name: id_ = elem.get('id', None) if id_ is not None: removed.add(id_) - elem.getparent().remove(elem) + self.remove_from_xml(elem) self.dirty(self.opf_name) if removed: - for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]', - namespaces={'opf':OPF2_NS}): + for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'): idref = item.get('idref') if idref in removed: - item.getparent().remove(item) + self.remove_from_xml(item) self.dirty(self.opf_name) - for item in self.opf.xpath('//opf:guide/opf:reference[@href]', - namespaces={'opf':OPF2_NS}): - if self.href_to_name(item.get('href')) == name: - item.getparent().remove(item) + for item in self.opf_xpath('//opf:guide/opf:reference[@href]'): + if self.href_to_name(item.get('href'), self.opf_name) == 
name: + self.remove_from_xml(item) self.dirty(self.opf_name) path = self.name_path_map.pop(name) @@ -230,6 +258,76 @@ class Container(object): def dirty(self, name): self.dirtied.add(name) + def remove_from_xml(self, item): + 'Removes item from parent, fixing indentation (works only with self closing items)' + parent = item.getparent() + idx = parent.index(item) + if idx == 0: + # We are removing the first item - only care about adjusting + # the tail if this was the only child + if len(parent) == 1: + parent.text = item.tail + else: + # Make sure the preceding item has this tail + parent[idx-1].tail = item.tail + parent.remove(item) + return item + + def insert_into_xml(self, parent, item, index=None): + '''Insert item into parent (or append if index is None), fixing + indentation. Only works with self closing items.''' + if index is None: + parent.append(item) + else: + parent.insert(index, item) + idx = parent.index(item) + if idx == 0: + item.tail = parent.text + # If this is the only child of this parent element, we need a + # little extra work as we have gone from a self-closing element + # to an element with separate opening and closing tags + if len(parent) == 1: + sibling = parent.getprevious() + if sibling is None: + # Give up! + return + parent.text = sibling.text + item.tail = sibling.tail + else: + item.tail = parent[idx-1].tail + if idx == len(parent)-1: + parent[idx-1].tail = parent.text + + def generate_item(self, name, id_prefix=None, media_type=None): + '''Add an item to the manifest with href derived from the given + name. Ensures uniqueness of href and id automatically. 
Returns + generated item.''' + id_prefix = id_prefix or 'id' + media_type = media_type or guess_type(name)[0] + path = self.name_to_abspath(name) + relpath = self.relpath(path, base=self.opf_dir) + href = urlquote(relpath) + base, ext = href.rpartition('.')[0::2] + all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')} + c = 0 + item_id = id_prefix + while item_id in all_ids: + c += 1 + item_id = id_prefix + '%d'%c + all_names = {x.get('href') for x in self.opf_xpath( + '//opf:manifest/opf:item[@href]')} + c = 0 + while href in all_names: + c += 1 + href = '%s_%d.%s'%(base, c, ext) + manifest = self.opf_xpath('//opf:manifest')[0] + item = manifest.makeelement(OPF('item'), nsmap=OPF_NAMESPACES, + id=item_id, href=href) + item.set('media-type', media_type) + self.insert_into_xml(manifest, item) + self.dirty(self.opf_name) + return item + def commit(self, outpath=None): for name in tuple(self.dirtied): self.dirtied.remove(name) @@ -257,6 +355,8 @@ OCF_NS = 'urn:oasis:names:tc:opendocument:xmlns:container' class EpubContainer(Container): + book_type = 'epub' + META_INF = { 'container.xml' : True, 'manifest.xml' : False, @@ -314,7 +414,7 @@ class EpubContainer(Container): if alg not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}: raise DRMError() cr = em.getparent().xpath('descendant::*[local-name()="CipherReference" and @URI]')[0] - name = self.href_to_name(cr.get('URI'), self.root) + name = self.href_to_name(cr.get('URI')) path = self.name_path_map.get(name, None) if path is not None: fonts[name] = alg @@ -327,14 +427,14 @@ class EpubContainer(Container): package_id = val break if package_id is not None: - for elem in self.opf.xpath('//*[@id=%r]'%package_id): + for elem in self.opf_xpath('//*[@id=%r]'%package_id): if elem.text: unique_identifier = elem.text.rpartition(':')[-1] break if unique_identifier is not None: idpf_key = hashlib.sha1(unique_identifier).digest() key = None - for item in self.opf.xpath('//*[local-name()="metadata"]/*' + for item in 
self.opf_xpath('//*[local-name()="metadata"]/*' '[local-name()="identifier"]'): scheme = None for xkey in item.attrib.keys(): @@ -397,6 +497,8 @@ def do_explode(path, dest): class AZW3Container(Container): + book_type = 'azw3' + def __init__(self, pathtoazw3, log): self.pathtoazw3 = pathtoazw3 tdir = self.root = PersistentTemporaryDirectory('_azw3_container') diff --git a/src/calibre/ebooks/oeb/polish/cover.py b/src/calibre/ebooks/oeb/polish/cover.py new file mode 100644 index 0000000000..2ad0e2bdfd --- /dev/null +++ b/src/calibre/ebooks/oeb/polish/cover.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import shutil + +from calibre.ebooks.oeb.base import OPF + +def set_azw3_cover(container, cover_path, report): + name = None + found = True + for gi in container.opf_xpath('//opf:guide/opf:reference[@href and contains(@type, "cover")]'): + href = gi.get('href') + name = container.href_to_name(href, container.opf_name) + container.remove_from_xml(gi) + if name is None or not container.has_name(name): + item = container.generate_item(name='cover.jpeg', id_prefix='cover') + name = container.href_to_name(item.get('href'), container.opf_name) + found = False + href = container.name_to_href(name, container.opf_name) + guide = container.opf_xpath('//opf:guide')[0] + container.insert_into_xml(guide, guide.makeelement( + OPF('reference'), href=href, type='cover')) + shutil.copyfile(cover_path, container.name_to_abspath(name)) + container.dirty(container.opf_name) + report('Cover updated' if found else 'Cover inserted') + +def set_cover(container, cover_path, report): + if container.book_type == 'azw3': + set_azw3_cover(container, cover_path, report) + diff --git a/src/calibre/ebooks/oeb/polish/main.py 
b/src/calibre/ebooks/oeb/polish/main.py index 57cf570ed4..3473caacdb 100644 --- a/src/calibre/ebooks/oeb/polish/main.py +++ b/src/calibre/ebooks/oeb/polish/main.py @@ -14,6 +14,7 @@ from functools import partial from calibre.ebooks.oeb.polish.container import get_container from calibre.ebooks.oeb.polish.stats import StatsCollector from calibre.ebooks.oeb.polish.subset import subset_all_fonts +from calibre.ebooks.oeb.polish.cover import set_cover from calibre.utils.logging import Log ALL_OPTS = { @@ -72,6 +73,7 @@ CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in HELP.iteritems()} # }}} def polish(file_map, opts, log, report): + rt = lambda x: report('\n### ' + x) for inbook, outbook in file_map.iteritems(): report('Polishing: %s'%(inbook.rpartition('.')[-1].upper())) ebook = get_container(inbook, log) @@ -80,10 +82,15 @@ def polish(file_map, opts, log, report): stats = StatsCollector(ebook) if opts.subset: - report('\n### Subsetting embedded fonts') + rt('Subsetting embedded fonts') subset_all_fonts(ebook, stats.font_stats, report) report('') + if opts.cover: + rt('Setting cover') + set_cover(ebook, opts.cover, report) + report('') + ebook.commit(outbook) def gui_polish(data): @@ -105,8 +112,12 @@ def option_parser(): USAGE = '%prog [options] input_file [output_file]\n\n' + re.sub( r'<.*?>', '', CLI_HELP['about']) parser = OptionParser(usage=USAGE) - o = partial(parser.add_option, default=False, action='store_true') + a = parser.add_option + o = partial(a, default=False, action='store_true') o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset']) + a('--cover', help=_( + 'Path to a cover image. Changes the cover specified in the ebook. 
' + 'If no cover is present, inserts a new cover.')) o('--verbose', help=_('Produce more verbose output, useful for debugging.')) return parser @@ -139,7 +150,7 @@ def main(): report = [] something = False for name in ALL_OPTS: - if name not in {'opf', 'cover'}: + if name not in {'opf', }: if getattr(popts, name): something = True diff --git a/src/calibre/ebooks/oeb/polish/subset.py b/src/calibre/ebooks/oeb/polish/subset.py index eb4f4c4b1f..5ab9db7f5c 100644 --- a/src/calibre/ebooks/oeb/polish/subset.py +++ b/src/calibre/ebooks/oeb/polish/subset.py @@ -15,7 +15,7 @@ from calibre.ebooks.oeb.polish.container import OEB_FONTS from calibre.utils.fonts.sfnt.subset import subset from calibre.utils.fonts.utils import get_font_names -def remove_font_face_rules(container, sheet, remove_names): +def remove_font_face_rules(container, sheet, remove_names, base): changed = False for rule in tuple(sheet.cssRules): if rule.type != rule.FONT_FACE_RULE: @@ -24,7 +24,7 @@ def remove_font_face_rules(container, sheet, remove_names): uri = rule.style.getProperty('src').propertyValue[0].uri except (IndexError, KeyError, AttributeError, TypeError, ValueError): continue - name = container.href_to_name(uri) + name = container.href_to_name(uri, base) if name in remove_names: sheet.deleteRule(rule) changed = True @@ -65,13 +65,13 @@ def subset_all_fonts(container, font_stats, report): for name, mt in container.mime_map.iteritems(): if mt in OEB_STYLES: sheet = container.parsed(name) - if remove_font_face_rules(container, sheet, remove): + if remove_font_face_rules(container, sheet, remove, name): container.dirty(name) elif mt in OEB_DOCS: for style in XPath('//h:style')(container.parsed(name)): if style.get('type', 'text/css') == 'text/css' and style.text: sheet = container.parse_css(style.text, name) - if remove_font_face_rules(container, sheet, remove): + if remove_font_face_rules(container, sheet, remove, name): style.text = sheet.cssText container.dirty(name) if total_old > 0: