mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
ebook-polish: Implement cover setting in AZW3 and fix various bugs in the container class
This commit is contained in:
parent
789f4ab01a
commit
6aa1b67d88
@ -8,7 +8,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, logging, sys, hashlib, uuid
|
||||
from urllib import unquote as urlunquote
|
||||
from urllib import unquote as urlunquote, quote as urlquote
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@ -22,8 +22,8 @@ from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcess
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.mobi.reader.headers import MetadataHeader
|
||||
from calibre.ebooks.mobi.tweak import set_cover
|
||||
from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
|
||||
OEB_STYLES, OPF2_NS)
|
||||
from calibre.ebooks.oeb.base import (
|
||||
serialize, OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS, DC11_NS, OPF)
|
||||
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
||||
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
||||
@ -34,9 +34,25 @@ from calibre.utils.zipfile import ZipFile
|
||||
exists, join, relpath = os.path.exists, os.path.join, os.path.relpath
|
||||
|
||||
OEB_FONTS = {guess_type('a.ttf')[0], guess_type('b.ttf')[0]}
|
||||
OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS}
|
||||
|
||||
class Container(object):
|
||||
|
||||
'''
|
||||
A container represents an Open EBook as a directory full of files and an
|
||||
opf file. There are two important concepts:
|
||||
|
||||
* The root directory. This is the base of the ebook. All the ebook's
|
||||
files are inside this directory or in its sub-directories.
|
||||
|
||||
* Names: These are paths to the book's files relative to the root
|
||||
directory. They always contain POSIX separators and are unquoted. They
|
||||
can be thought of as canonical identifiers for files in the book.
|
||||
Most methods on the container object work with names.
|
||||
'''
|
||||
|
||||
book_type = 'oeb'
|
||||
|
||||
def __init__(self, rootpath, opfpath, log):
|
||||
self.root = os.path.abspath(rootpath)
|
||||
self.log = log
|
||||
@ -54,7 +70,7 @@ class Container(object):
|
||||
for dirpath, _dirnames, filenames in os.walk(self.root):
|
||||
for f in filenames:
|
||||
path = join(dirpath, f)
|
||||
name = relpath(path, self.root).replace(os.sep, '/')
|
||||
name = self.abspath_to_name(path)
|
||||
self.name_path_map[name] = path
|
||||
self.mime_map[name] = guess_type(path)[0]
|
||||
# Special case if we have stumbled onto the opf
|
||||
@ -63,35 +79,52 @@ class Container(object):
|
||||
self.opf_dir = os.path.dirname(path)
|
||||
self.mime_map[name] = guess_type('a.opf')[0]
|
||||
|
||||
if not hasattr(self, 'opf_name'):
|
||||
raise InvalidBook('Book has no OPF file')
|
||||
|
||||
# Update mime map with data from the OPF
|
||||
for item in self.opf.xpath(
|
||||
'//opf:manifest/opf:item[@href and @media-type]',
|
||||
namespaces={'opf':OPF2_NS}):
|
||||
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
|
||||
href = item.get('href')
|
||||
self.mime_map[self.href_to_name(href)] = item.get('media-type')
|
||||
self.mime_map[self.href_to_name(href, self.opf_name)] = item.get('media-type')
|
||||
|
||||
def abspath_to_name(self, fullpath):
    '''Convert a filesystem path into a name, i.e. the path relative to
    the container root expressed with POSIX separators.'''
    absolute = os.path.abspath(fullpath)
    relative = self.relpath(absolute)
    return relative.replace(os.sep, '/')
|
||||
|
||||
def name_to_abspath(self, name):
    '''Convert a name (POSIX separators, relative to self.root) into an
    absolute filesystem path.'''
    components = name.split('/')
    return os.path.abspath(os.path.join(self.root, *components))
|
||||
|
||||
def href_to_name(self, href, base=None):
|
||||
'''
|
||||
Convert an href (relative to base) to a name (i.e. a path
|
||||
relative to self.root with POSIX separators).
|
||||
|
||||
base must be an absolute path with OS separators or None, in which case
|
||||
the href is interpreted relative to the dir containing the OPF.
|
||||
Convert an href (relative to base) to a name. base must be a name or
|
||||
None, in which case self.root is used.
|
||||
'''
|
||||
if base is None:
|
||||
base = self.opf_dir
|
||||
base = self.root
|
||||
else:
|
||||
base = os.path.dirname(self.name_to_abspath(base))
|
||||
href = urlunquote(href.partition('#')[0])
|
||||
fullpath = os.path.join(base, *href.split('/'))
|
||||
return self.abspath_to_name(fullpath)
|
||||
|
||||
def name_to_href(self, name, base=None):
    '''Convert a name to an href relative to base.

    :param name: A name, i.e. a path relative to self.root with POSIX
        separators.
    :param base: The name of the file the href is relative to, or None, in
        which case self.root is used as the base directory.
    :return: The URL-quoted relative path, using '/' as the separator.
    '''
    fullpath = self.name_to_abspath(name)
    if base is None:
        basepath = self.root
    else:
        # hrefs are resolved against the directory containing the base file
        basepath = os.path.dirname(self.name_to_abspath(base))
    path = relpath(fullpath, basepath).replace(os.sep, '/')
    if not isinstance(path, bytes):
        # urllib.quote() in Python 2 raises KeyError when handed a unicode
        # string containing non-ASCII characters, so quote UTF-8 bytes.
        path = path.encode('utf-8')
    return urlquote(path)
|
||||
|
||||
def opf_xpath(self, expr):
    '''Evaluate the XPath expression expr against the OPF, with the
    prefixes from OPF_NAMESPACES available to the expression.'''
    opf_root = self.opf
    return opf_root.xpath(expr, namespaces=OPF_NAMESPACES)
|
||||
|
||||
def has_name(self, name):
    '''Return True if name is a key of self.name_path_map.'''
    known_names = self.name_path_map
    return name in known_names
|
||||
|
||||
def relpath(self, path):
|
||||
return relpath(path, self.root)
|
||||
def relpath(self, path, base=None):
    '''Return path (absolute, with OS separators) expressed relative to
    base, falling back to self.root when base is not given. The result
    keeps OS separators and is therefore *not* a name; use
    abspath_to_name() to obtain a name.'''
    start = base or self.root
    return os.path.relpath(path, start)
|
||||
|
||||
def decode(self, data):
|
||||
"""Automatically decode :param:`data` into a `unicode` object."""
|
||||
@ -173,13 +206,11 @@ class Container(object):
|
||||
|
||||
@property
|
||||
def spine_items(self):
|
||||
manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'))
|
||||
for item in self.opf.xpath('//opf:manifest/opf:item[@href and @id]',
|
||||
namespaces={'opf':OPF2_NS})}
|
||||
manifest_id_map = {item.get('id'):self.href_to_name(item.get('href'), self.opf_name)
|
||||
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @id]')}
|
||||
|
||||
linear, non_linear = [], []
|
||||
for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
|
||||
namespaces={'opf':OPF2_NS}):
|
||||
for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
|
||||
idref = item.get('idref')
|
||||
name = manifest_id_map.get(idref, None)
|
||||
path = self.name_path_map.get(name, None)
|
||||
@ -198,26 +229,23 @@ class Container(object):
|
||||
any internal caches.
|
||||
'''
|
||||
removed = set()
|
||||
for elem in self.opf.xpath('//opf:manifest/opf:item[@href]',
|
||||
namespaces={'opf':OPF2_NS}):
|
||||
if self.href_to_name(elem.get('href')) == name:
|
||||
for elem in self.opf_xpath('//opf:manifest/opf:item[@href]'):
|
||||
if self.href_to_name(elem.get('href'), self.opf_name) == name:
|
||||
id_ = elem.get('id', None)
|
||||
if id_ is not None:
|
||||
removed.add(id_)
|
||||
elem.getparent().remove(elem)
|
||||
self.remove_from_xml(elem)
|
||||
self.dirty(self.opf_name)
|
||||
if removed:
|
||||
for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
|
||||
namespaces={'opf':OPF2_NS}):
|
||||
for item in self.opf_xpath('//opf:spine/opf:itemref[@idref]'):
|
||||
idref = item.get('idref')
|
||||
if idref in removed:
|
||||
item.getparent().remove(item)
|
||||
self.remove_from_xml(item)
|
||||
self.dirty(self.opf_name)
|
||||
|
||||
for item in self.opf.xpath('//opf:guide/opf:reference[@href]',
|
||||
namespaces={'opf':OPF2_NS}):
|
||||
if self.href_to_name(item.get('href')) == name:
|
||||
item.getparent().remove(item)
|
||||
for item in self.opf_xpath('//opf:guide/opf:reference[@href]'):
|
||||
if self.href_to_name(item.get('href'), self.opf_name) == name:
|
||||
self.remove_from_xml(item)
|
||||
self.dirty(self.opf_name)
|
||||
|
||||
path = self.name_path_map.pop(name)
|
||||
@ -230,6 +258,76 @@ class Container(object):
|
||||
def dirty(self, name):
|
||||
self.dirtied.add(name)
|
||||
|
||||
def remove_from_xml(self, item):
    '''Detach item from its parent element and return it, patching up the
    surrounding whitespace so the indentation stays intact (works only
    with self closing items).'''
    owner = item.getparent()
    position = owner.index(item)
    if position > 0:
        # The preceding sibling inherits the whitespace that followed item
        owner[position - 1].tail = item.tail
    elif len(owner) == 1:
        # item was the only child, so its tail becomes the parent's text
        owner.text = item.tail
    owner.remove(item)
    return item
|
||||
|
||||
def insert_into_xml(self, parent, item, index=None):
    '''Place item inside parent at index (appending when index is None)
    and adjust text/tail whitespace so the indentation is preserved. Only
    works with self closing items.'''
    if index is None:
        parent.append(item)
    else:
        parent.insert(index, item)
    position = parent.index(item)

    if position > 0:
        previous = parent[position - 1]
        item.tail = previous.tail
        if position == len(parent) - 1:
            # item is now the last child: the former last child needs the
            # whitespace used between children
            previous.tail = parent.text
        return

    item.tail = parent.text
    if len(parent) != 1:
        return
    # item is the only child, so parent has gone from a self-closing
    # <foo /> element to <foo><item /></foo>; borrow the whitespace from
    # the element preceding parent
    sibling = parent.getprevious()
    if sibling is None:
        # Nothing to copy the whitespace from, give up
        return
    parent.text = sibling.text
    item.tail = sibling.tail
|
||||
|
||||
def generate_item(self, name, id_prefix=None, media_type=None):
    '''Add an item to the manifest with href derived from the given
    name. Ensures uniqueness of href and id automatically. Returns
    generated item.

    :param name: The name the new href is derived from.
    :param id_prefix: Prefix for the generated id, defaults to 'id'.
    :param media_type: The media-type attribute of the item. When not
        given it is guessed from name, falling back to
        'application/octet-stream' if it cannot be guessed.

    This method only edits the OPF (and marks it dirty); it does not touch
    self.name_path_map or self.mime_map.
    '''
    id_prefix = id_prefix or 'id'
    # lxml refuses None as an attribute value, so always end up with a string
    media_type = media_type or guess_type(name)[0] or 'application/octet-stream'
    path = self.name_to_abspath(name)
    # hrefs always use '/' as the separator; convert before quoting so that
    # OS separators are not percent-encoded into the href
    rel = self.relpath(path, base=self.opf_dir).replace(os.sep, '/')
    href = urlquote(rel)

    base, dot, ext = href.rpartition('.')
    if not dot or '/' in ext:
        # No extension in the final path component
        base, ext = href, ''
    suffix = ('.' + ext) if ext else ''

    all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
    c = 0
    item_id = id_prefix
    while item_id in all_ids:
        c += 1
        item_id = id_prefix + '%d'%c

    all_names = {x.get('href') for x in self.opf_xpath(
        '//opf:manifest/opf:item[@href]')}
    c = 0
    while href in all_names:
        c += 1
        href = '%s_%d%s'%(base, c, suffix)

    manifest = self.opf_xpath('//opf:manifest')[0]
    item = manifest.makeelement(OPF('item'), nsmap=OPF_NAMESPACES,
                                id=item_id, href=href)
    # 'media-type' is not a valid keyword argument name, so set it separately
    item.set('media-type', media_type)
    self.insert_into_xml(manifest, item)
    self.dirty(self.opf_name)
    return item
|
||||
|
||||
def commit(self, outpath=None):
|
||||
for name in tuple(self.dirtied):
|
||||
self.dirtied.remove(name)
|
||||
@ -257,6 +355,8 @@ OCF_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'
|
||||
|
||||
class EpubContainer(Container):
|
||||
|
||||
book_type = 'epub'
|
||||
|
||||
META_INF = {
|
||||
'container.xml' : True,
|
||||
'manifest.xml' : False,
|
||||
@ -314,7 +414,7 @@ class EpubContainer(Container):
|
||||
if alg not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
|
||||
raise DRMError()
|
||||
cr = em.getparent().xpath('descendant::*[local-name()="CipherReference" and @URI]')[0]
|
||||
name = self.href_to_name(cr.get('URI'), self.root)
|
||||
name = self.href_to_name(cr.get('URI'))
|
||||
path = self.name_path_map.get(name, None)
|
||||
if path is not None:
|
||||
fonts[name] = alg
|
||||
@ -327,14 +427,14 @@ class EpubContainer(Container):
|
||||
package_id = val
|
||||
break
|
||||
if package_id is not None:
|
||||
for elem in self.opf.xpath('//*[@id=%r]'%package_id):
|
||||
for elem in self.opf_xpath('//*[@id=%r]'%package_id):
|
||||
if elem.text:
|
||||
unique_identifier = elem.text.rpartition(':')[-1]
|
||||
break
|
||||
if unique_identifier is not None:
|
||||
idpf_key = hashlib.sha1(unique_identifier).digest()
|
||||
key = None
|
||||
for item in self.opf.xpath('//*[local-name()="metadata"]/*'
|
||||
for item in self.opf_xpath('//*[local-name()="metadata"]/*'
|
||||
'[local-name()="identifier"]'):
|
||||
scheme = None
|
||||
for xkey in item.attrib.keys():
|
||||
@ -397,6 +497,8 @@ def do_explode(path, dest):
|
||||
|
||||
class AZW3Container(Container):
|
||||
|
||||
book_type = 'azw3'
|
||||
|
||||
def __init__(self, pathtoazw3, log):
|
||||
self.pathtoazw3 = pathtoazw3
|
||||
tdir = self.root = PersistentTemporaryDirectory('_azw3_container')
|
||||
|
36
src/calibre/ebooks/oeb/polish/cover.py
Normal file
36
src/calibre/ebooks/oeb/polish/cover.py
Normal file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import shutil
|
||||
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
|
||||
def set_azw3_cover(container, cover_path, report):
    '''Make the image at cover_path the cover of the book in container.

    Every guide reference whose type contains "cover" is removed. If the
    last such reference pointed at a file present in the container, that
    file is overwritten; otherwise a new manifest item based on the name
    'cover.jpeg' is generated. A single guide reference of type 'cover' is
    then inserted, the image is copied into place and the outcome is
    passed to report.
    '''
    cover_name = None
    for ref in container.opf_xpath('//opf:guide/opf:reference[@href and contains(@type, "cover")]'):
        cover_name = container.href_to_name(ref.get('href'), container.opf_name)
        container.remove_from_xml(ref)

    replacing = cover_name is not None and container.has_name(cover_name)
    if not replacing:
        new_item = container.generate_item(name='cover.jpeg', id_prefix='cover')
        cover_name = container.href_to_name(new_item.get('href'), container.opf_name)

    cover_href = container.name_to_href(cover_name, container.opf_name)
    guide = container.opf_xpath('//opf:guide')[0]
    reference = guide.makeelement(OPF('reference'), href=cover_href, type='cover')
    container.insert_into_xml(guide, reference)
    shutil.copyfile(cover_path, container.name_to_abspath(cover_name))
    container.dirty(container.opf_name)
    if replacing:
        report('Cover updated')
    else:
        report('Cover inserted')
|
||||
|
||||
def set_cover(container, cover_path, report):
    '''Set the cover of the book in container to the image at cover_path.
    Only containers whose book_type is 'azw3' are handled; for any other
    book type nothing is done.'''
    if container.book_type != 'azw3':
        return
    set_azw3_cover(container, cover_path, report)
|
||||
|
@ -14,6 +14,7 @@ from functools import partial
|
||||
from calibre.ebooks.oeb.polish.container import get_container
|
||||
from calibre.ebooks.oeb.polish.stats import StatsCollector
|
||||
from calibre.ebooks.oeb.polish.subset import subset_all_fonts
|
||||
from calibre.ebooks.oeb.polish.cover import set_cover
|
||||
from calibre.utils.logging import Log
|
||||
|
||||
ALL_OPTS = {
|
||||
@ -72,6 +73,7 @@ CLI_HELP = {x:hfix(x, re.sub('<.*?>', '', y)) for x, y in HELP.iteritems()}
|
||||
# }}}
|
||||
|
||||
def polish(file_map, opts, log, report):
|
||||
rt = lambda x: report('\n### ' + x)
|
||||
for inbook, outbook in file_map.iteritems():
|
||||
report('Polishing: %s'%(inbook.rpartition('.')[-1].upper()))
|
||||
ebook = get_container(inbook, log)
|
||||
@ -80,10 +82,15 @@ def polish(file_map, opts, log, report):
|
||||
stats = StatsCollector(ebook)
|
||||
|
||||
if opts.subset:
|
||||
report('\n### Subsetting embedded fonts')
|
||||
rt('Subsetting embedded fonts')
|
||||
subset_all_fonts(ebook, stats.font_stats, report)
|
||||
report('')
|
||||
|
||||
if opts.cover:
|
||||
rt('Setting cover')
|
||||
set_cover(ebook, opts.cover, report)
|
||||
report('')
|
||||
|
||||
ebook.commit(outbook)
|
||||
|
||||
def gui_polish(data):
|
||||
@ -105,8 +112,12 @@ def option_parser():
|
||||
USAGE = '%prog [options] input_file [output_file]\n\n' + re.sub(
|
||||
r'<.*?>', '', CLI_HELP['about'])
|
||||
parser = OptionParser(usage=USAGE)
|
||||
o = partial(parser.add_option, default=False, action='store_true')
|
||||
a = parser.add_option
|
||||
o = partial(a, default=False, action='store_true')
|
||||
o('--subset-fonts', '-f', dest='subset', help=CLI_HELP['subset'])
|
||||
a('--cover', help=_(
|
||||
'Path to a cover image. Changes the cover specified in the ebook. '
|
||||
'If no cover is present, inserts a new cover.'))
|
||||
o('--verbose', help=_('Produce more verbose output, useful for debugging.'))
|
||||
|
||||
return parser
|
||||
@ -139,7 +150,7 @@ def main():
|
||||
report = []
|
||||
something = False
|
||||
for name in ALL_OPTS:
|
||||
if name not in {'opf', 'cover'}:
|
||||
if name not in {'opf', }:
|
||||
if getattr(popts, name):
|
||||
something = True
|
||||
|
||||
|
@ -15,7 +15,7 @@ from calibre.ebooks.oeb.polish.container import OEB_FONTS
|
||||
from calibre.utils.fonts.sfnt.subset import subset
|
||||
from calibre.utils.fonts.utils import get_font_names
|
||||
|
||||
def remove_font_face_rules(container, sheet, remove_names):
|
||||
def remove_font_face_rules(container, sheet, remove_names, base):
|
||||
changed = False
|
||||
for rule in tuple(sheet.cssRules):
|
||||
if rule.type != rule.FONT_FACE_RULE:
|
||||
@ -24,7 +24,7 @@ def remove_font_face_rules(container, sheet, remove_names):
|
||||
uri = rule.style.getProperty('src').propertyValue[0].uri
|
||||
except (IndexError, KeyError, AttributeError, TypeError, ValueError):
|
||||
continue
|
||||
name = container.href_to_name(uri)
|
||||
name = container.href_to_name(uri, base)
|
||||
if name in remove_names:
|
||||
sheet.deleteRule(rule)
|
||||
changed = True
|
||||
@ -65,13 +65,13 @@ def subset_all_fonts(container, font_stats, report):
|
||||
for name, mt in container.mime_map.iteritems():
|
||||
if mt in OEB_STYLES:
|
||||
sheet = container.parsed(name)
|
||||
if remove_font_face_rules(container, sheet, remove):
|
||||
if remove_font_face_rules(container, sheet, remove, name):
|
||||
container.dirty(name)
|
||||
elif mt in OEB_DOCS:
|
||||
for style in XPath('//h:style')(container.parsed(name)):
|
||||
if style.get('type', 'text/css') == 'text/css' and style.text:
|
||||
sheet = container.parse_css(style.text, name)
|
||||
if remove_font_face_rules(container, sheet, remove):
|
||||
if remove_font_face_rules(container, sheet, remove, name):
|
||||
style.text = sheet.cssText
|
||||
container.dirty(name)
|
||||
if total_old > 0:
|
||||
|
Loading…
x
Reference in New Issue
Block a user