ebook-polish: Implement cover setting in AZW3 and fix various bugs in the container class

This commit is contained in:
Kovid Goyal 2013-02-10 10:05:02 +05:30
parent 789f4ab01a
commit 6aa1b67d88
4 changed files with 191 additions and 42 deletions

View File

@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, logging, sys, hashlib, uuid
from urllib import unquote as urlunquote
from urllib import unquote as urlunquote, quote as urlquote
from lxml import etree
@ -22,8 +22,8 @@ from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcess
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.mobi.tweak import set_cover
from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
OEB_STYLES, OPF2_NS)
from calibre.ebooks.oeb.base import (
serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@ -34,9 +34,25 @@ from calibre.utils.zipfile import ZipFile
exists, join, relpath = os.path.exists, os.path.join, os.path.relpath
OEB_FONTS = {guess_type('a.ttf')[0], guess_type('b.ttf')[0]}
OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS}
class Container(object):
'''
A container represents an Open EBook as a directory full of files and an
opf file. There are two important concepts:
* The root directory. This is the base of the ebook. All the ebook's
files are inside this directory or in its sub-directories.
* Names: These are paths to the book's files relative to the root
directory. They always contain POSIX separators and are unquoted. They
can be thought of as canonical identifiers for files in the book.
Most methods on the container object work with names.
'''
book_type = 'oeb'
def __init__(self, rootpath, opfpath, log):
self.root = os.path.abspath(rootpath)
self.log = log
@ -54,7 +70,7 @@ class Container(object):
for dirpath, _dirnames, filenames in os.walk(self.root):
for f in filenames:
path = join(dirpath, f)
name = relpath(path, self.root).replace(os.sep, '/')
name = self.abspath_to_name(path)
self.name_path_map[name] = path
self.mime_map[name] = guess_type(path)[0]
# Special case if we have stumbled onto the opf
@ -63,35 +79,52 @@ class Container(object):
self.opf_dir = os.path.dirname(path)
self.mime_map[name] = guess_type('a.opf')[0]
if not hasattr(self, 'opf_name'):
raise InvalidBook('Book has no OPF file')
# Update mime map with data from the OPF
for item in self.opf.xpath(
'//opf:manifest/opf:item[@href and @media-type]',
namespaces={'opf':OPF2_NS}):
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
href = item.get('href')
self.mime_map[self.href_to_name(href)] = item.get('media-type')
self.mime_map[self.href_to_name(href, self.opf_name)] = item.get('media-type')
def abspath_to_name(self, fullpath):
return self.relpath(os.path.abspath(fullpath)).replace(os.sep, '/')
def name_to_abspath(self, name):
return os.path.abspath(join(self.root, *name.split('/')))
def href_to_name(self, href, base=None):
'''
Convert an href (relative to base) to a name (i.e. a path
relative to self.root with POSIX separators).
base must be an absolute path with OS separators or None, in which case
the href is interpreted relative to the dir containing the OPF.
Convert an href (relative to base) to a name. base must be a name or
None, in which self.root is used.
'''
if base is None:
base = self.opf_dir
base = self.root
else:
base = os.path.dirname(self.name_to_abspath(base))
href = urlunquote(href.partition('#')[0])
fullpath = os.path.join(base, *href.split('/'))
return self.abspath_to_name(fullpath)
def name_to_href(self, name, base=None):
'''Convert a name to a href relative to base, which must be a name or
None in which case self.root is used as the base'''
fullpath = self.name_to_abspath(name)
basepath = self.root if base is None else os.path.dirname(self.name_to_abspath(base))
path = relpath(fullpath, basepath).replace(os.sep, '/')
return urlquote(path)
def opf_xpath(self, expr):
    '''Evaluate the XPath expression expr against self.opf with the 'opf'
    and 'dc' prefixes from OPF_NAMESPACES bound.'''
    opf = self.opf
    return opf.xpath(expr, namespaces=OPF_NAMESPACES)
def has_name(self, name):
return name in self.name_path_map
def relpath(self, path):
return relpath(path, self.root)
def relpath(self, path, base=None):
'''Convert an absolute path (with os separators) to a path relative to
base (defaults to self.root). The relative path is *not* a name. Use
abspath_to_name() for that.'''
return relpath(path, base or self.root)
def decode(self, data):
"""Automatically decode :param:`data` into a `unicode` object."""
@ -173,13 +206,11 @@ class Container(object):
@property
def spine_items(self):
manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'))
for item in self.opf.xpath('//opf:manifest/opf:item[@href and @id]',
namespaces={'opf':OPF2_NS})}
manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}
linear, non_linear = [], []
for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
namespaces={'opf':OPF2_NS}):
for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
idref = item.get('idref')
name = manifest_id_map.get(idref, None)
path = self.name_path_map.get(name, None)
@ -198,26 +229,23 @@ class Container(object):
any internal caches.
'''
removed = set()
for elem in self.opf.xpath('//opf:manifest/opf:item[@href]',
namespaces={'opf':OPF2_NS}):
if self.href_to_name(elem.get('href')) == name:
for elem in self.opf_xpath('//opf:manifest/opf:item[@href]'):
if self.href_to_name(elem.get('href'), self.opf_name) == name:
id_ = elem.get('id', None)
if id_ is not None:
removed.add(id_)
elem.getparent().remove(elem)
self.remove_from_xml(elem)
self.dirty(self.opf_name)
if removed:
for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
namespaces={'opf':OPF2_NS}):
for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
idref = item.get('idref')
if idref in removed:
item.getparent().remove(item)
self.remove_from_xml(item)
self.dirty(self.opf_name)
for item in self.opf.xpath('//opf:guide/opf:reference[@href]',
namespaces={'opf':OPF2_NS}):
if self.href_to_name(item.get('href')) == name:
item.getparent().remove(item)
for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
if self.href_to_name(item.get('href'), self.opf_name) == name:
self.remove_from_xml(item)
self.dirty(self.opf_name)
path = self.name_path_map.pop(name)
@ -230,6 +258,76 @@ class Container(object):
def dirty(self, name):
self.dirtied.add(name)
def remove_from_xml(self, item):
    '''Detach item from its parent element while keeping the surrounding
    indentation intact (works only with self closing items). Returns the
    removed item.'''
    container = item.getparent()
    position = container.index(item)
    if position > 0:
        # The preceding sibling inherits the whitespace that followed item
        container[position - 1].tail = item.tail
    elif len(container) == 1:
        # item was the only child, so its trailing whitespace becomes the
        # text of the parent. When other children follow a removed first
        # item nothing needs adjusting.
        container.text = item.tail
    container.remove(item)
    return item
def insert_into_xml(self, parent, item, index=None):
    '''Place item inside parent at index (appending when index is None)
    and adjust text/tail whitespace so the indentation stays consistent.
    Only works with self closing items.'''
    if index is None:
        parent.append(item)
    else:
        parent.insert(index, item)
    position = parent.index(item)

    if position > 0:
        previous = parent[position - 1]
        item.tail = previous.tail
        if position == len(parent) - 1:
            # item is now the last child: the former last child gets the
            # same whitespace that precedes the first child
            previous.tail = parent.text
        return

    item.tail = parent.text
    if len(parent) != 1:
        return
    # item is the only child, i.e. parent went from a self-closing <foo />
    # to <foo><item /></foo>; borrow whitespace from the element before
    # parent
    sibling = parent.getprevious()
    if sibling is None:
        # Nothing to copy the indentation from, give up
        return
    parent.text = sibling.text
    item.tail = sibling.tail
def generate_item(self, name, id_prefix=None, media_type=None):
    '''Add an item to the manifest with href derived from the given
    name. Ensures uniqueness of href and id automatically. Returns
    generated item.

    :param name: Name the new manifest item should point to.
    :param id_prefix: Prefix for the generated id (defaults to 'id'). A
        counter is appended when the id is already used in the OPF.
    :param media_type: Value of the media-type attribute. Guessed from
        name when not given.

    The href of the returned item differs from the one implied by name
    when that href is already present in the manifest, so callers should
    derive the final name from the href of the returned item.
    '''
    id_prefix = id_prefix or 'id'
    # lxml rejects None as an attribute value, so fall back to a generic
    # type when nothing was supplied and guessing fails
    media_type = (media_type or guess_type(name)[0] or
                  'application/octet-stream')
    path = self.name_to_abspath(name)
    # hrefs are URLs and must use '/' regardless of the OS separator,
    # otherwise backslashes end up percent-encoded in the OPF on Windows
    rel = self.relpath(path, base=self.opf_dir).replace(os.sep, '/')
    href = urlquote(rel)
    # splitext keeps extension-less names and dots in directory names
    # intact; ext includes the leading '.' (or is empty)
    base, ext = os.path.splitext(href)

    all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
    c = 0
    item_id = id_prefix
    while item_id in all_ids:
        c += 1
        item_id = id_prefix + '%d'%c

    all_names = {x.get('href') for x in self.opf_xpath(
        '//opf:manifest/opf:item[@href]')}
    c = 0
    while href in all_names:
        c += 1
        href = '%s_%d%s'%(base, c, ext)

    manifest = self.opf_xpath('//opf:manifest')[0]
    item = manifest.makeelement(OPF('item'), nsmap=OPF_NAMESPACES,
                                id=item_id, href=href)
    item.set('media-type', media_type)
    self.insert_into_xml(manifest, item)
    self.dirty(self.opf_name)
    return item
def commit(self, outpath=None):
for name in tuple(self.dirtied):
self.dirtied.remove(name)
@ -257,6 +355,8 @@ OCF_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'
class EpubContainer(Container):
book_type = 'epub'
META_INF = {
'container.xml' : True,
'manifest.xml' : False,
@ -314,7 +414,7 @@ class EpubContainer(Container):
if alg not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
raise DRMError()
cr = em.getparent().xpath('descendant::*[local-name()="CipherReference" and @URI]')[0]
name = self.href_to_name(cr.get('URI'), self.root)
name = self.href_to_name(cr.get('URI'))
path = self.name_path_map.get(name, None)
if path is not None:
fonts[name] = alg
@ -327,14 +427,14 @@ class EpubContainer(Container):
package_id = val
break
if package_id is not None:
for elem in self.opf.xpath('//*[@id=%r]'%package_id):
for elem in self.opf_xpath('//*[@id=%r]'%package_id):
if elem.text:
unique_identifier = elem.text.rpartition(':')[-1]
break
if unique_identifier is not None:
idpf_key = hashlib.sha1(unique_identifier).digest()
key = None
for item in self.opf.xpath('//*[local-name()="metadata"]/*'
for item in self.opf_xpath('//*[local-name()="metadata"]/*'
'[local-name()="identifier"]'):
scheme = None
for xkey in item.attrib.keys():
@ -397,6 +497,8 @@ def do_explode(path, dest):
class AZW3Container(Container):
book_type = 'azw3'
def __init__(self, pathtoazw3, log):
self.pathtoazw3 = pathtoazw3
tdir = self.root = PersistentTemporaryDirectory('_azw3_container')

View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import shutil
from calibre.ebooks.oeb.base import OPF
def set_azw3_cover(container, cover_path, report):
    '''Make the image at cover_path the cover of an AZW3 container.

    Every guide reference whose type contains "cover" is removed from the
    OPF. If the last one pointed at a file known to the container, that
    file is overwritten; otherwise a new manifest item is generated for
    'cover.jpeg'. A single guide reference of type 'cover' is then
    inserted.

    :param container: The container to modify.
    :param cover_path: Filesystem path of the new cover image.
    :param report: Callable invoked with a one line status message.
    '''
    name = None
    for ref in container.opf_xpath(
            '//opf:guide/opf:reference[@href and contains(@type, "cover")]'):
        name = container.href_to_name(ref.get('href'), container.opf_name)
        container.remove_from_xml(ref)

    found = name is not None and container.has_name(name)
    if not found:
        item = container.generate_item(name='cover.jpeg', id_prefix='cover')
        # generate_item may have altered the href to keep it unique
        name = container.href_to_name(item.get('href'), container.opf_name)

    href = container.name_to_href(name, container.opf_name)
    guides = container.opf_xpath('//opf:guide')
    if guides:
        guide = guides[0]
    else:
        # The guide element is optional in an OPF; create it rather than
        # failing with an IndexError
        package = container.opf_xpath('//opf:package')[0]
        guide = package.makeelement(OPF('guide'))
        guide.tail = '\n'
        package.append(guide)
    container.insert_into_xml(guide, guide.makeelement(
        OPF('reference'), href=href, type='cover'))

    shutil.copyfile(cover_path, container.name_to_abspath(name))
    container.dirty(container.opf_name)
    report('Cover updated' if found else 'Cover inserted')
def set_cover(container, cover_path, report):
    '''Set the cover of container to the image at cover_path. Only
    containers whose book_type is 'azw3' are handled; any other type is
    left untouched.'''
    if container.book_type != 'azw3':
        return
    set_azw3_cover(container, cover_path, report)

View File

@ -14,6 +14,7 @@ from functools import partial
from calibre.ebooks.oeb.polish.container import get_container
from calibre.ebooks.oeb.polish.stats import StatsCollector
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
from calibre.ebooks.oeb.polish.cover import set_cover
from calibre.utils.logging import Log
ALL_OPTS = {
@ -72,6 +73,7 @@ CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in HELP.iteritems()}
# }}}
def polish(file_map, opts, log, report):
rt = lambda x: report('\n### ' + x)
for inbook, outbook in file_map.iteritems():
report('Polishing: %s'%(inbook.rpartition('.')[-1].upper()))
ebook = get_container(inbook, log)
@ -80,10 +82,15 @@ def polish(file_map, opts, log, report):
stats = StatsCollector(ebook)
if opts.subset:
report('\n### Subsetting embedded fonts')
rt('Subsetting embedded fonts')
subset_all_fonts(ebook, stats.font_stats, report)
report('')
if opts.cover:
rt('Setting cover')
set_cover(ebook, opts.cover, report)
report('')
ebook.commit(outbook)
def gui_polish(data):
@ -105,8 +112,12 @@ def option_parser():
USAGE = '%prog [options] input_file [output_file]\n\n' + re.sub(
r'<.*?>', '', CLI_HELP['about'])
parser = OptionParser(usage=USAGE)
o = partial(parser.add_option, default=False, action='store_true')
a = parser.add_option
o = partial(a, default=False, action='store_true')
o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
a('--cover', help=_(
'Path to a cover image. Changes the cover specified in the ebook. '
'If no cover is present, inserts a new cover.'))
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
return parser
@ -139,7 +150,7 @@ def main():
report = []
something = False
for name in ALL_OPTS:
if name not in {'opf', 'cover'}:
if name not in {'opf', }:
if getattr(popts, name):
something = True

View File

@ -15,7 +15,7 @@ from calibre.ebooks.oeb.polish.container import OEB_FONTS
from calibre.utils.fonts.sfnt.subset import subset
from calibre.utils.fonts.utils import get_font_names
def remove_font_face_rules(container, sheet, remove_names):
def remove_font_face_rules(container, sheet, remove_names, base):
changed = False
for rule in tuple(sheet.cssRules):
if rule.type != rule.FONT_FACE_RULE:
@ -24,7 +24,7 @@ def remove_font_face_rules(container, sheet, remove_names):
uri = rule.style.getProperty('src').propertyValue[0].uri
except (IndexError, KeyError, AttributeError, TypeError, ValueError):
continue
name = container.href_to_name(uri)
name = container.href_to_name(uri, base)
if name in remove_names:
sheet.deleteRule(rule)
changed = True
@ -65,13 +65,13 @@ def subset_all_fonts(container, font_stats, report):
for name, mt in container.mime_map.iteritems():
if mt in OEB_STYLES:
sheet = container.parsed(name)
if remove_font_face_rules(container, sheet, remove):
if remove_font_face_rules(container, sheet, remove, name):
container.dirty(name)
elif mt in OEB_DOCS:
for style in XPath('//h:style')(container.parsed(name)):
if style.get('type', 'text/css') == 'text/css' and style.text:
sheet = container.parse_css(style.text, name)
if remove_font_face_rules(container, sheet, remove):
if remove_font_face_rules(container, sheet, remove, name):
style.text = sheet.cssText
container.dirty(name)
if total_old > 0: