ebook-polish: Roundtripping for both epub and azw3.

This commit is contained in:
Kovid Goyal 2013-02-03 22:36:31 +05:30
parent d661b15ae2
commit 09429e3c1c
2 changed files with 130 additions and 26 deletions

View File

@ -337,6 +337,24 @@ def xml2unicode(root, pretty_print=False):
def xml2text(elem):
    '''
    Return the text of ``elem`` as a unicode string, serialized with lxml's
    ``text`` method. The tail text that follows ``elem`` is left out.
    '''
    options = dict(method='text', encoding=unicode, with_tail=False)
    return etree.tostring(elem, **options)
def serialize(data, media_type, pretty_print=False):
    '''
    Convert ``data`` into a byte string suitable for writing to a file.

    :param data: An lxml element, a unicode string, an object with a
        ``cssText`` attribute, or anything ``bytes()`` accepts.
    :param media_type: The media type of the item ``data`` belongs to. It is
        only consulted when ``data`` is an lxml element.
    :param pretty_print: Passed through to ``xml2str()`` for lxml elements.
    :return: The serialized data.
    '''
    if isinstance(data, etree._Element):
        raw = xml2str(data, pretty_print=pretty_print)
        if media_type not in OEB_DOCS:
            return raw
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(raw)

    if isinstance(data, unicode):
        return data.encode('utf-8')

    if not hasattr(data, 'cssText'):
        return bytes(data)

    # Stylesheet-like object: emit its text, terminated by a newline
    css = data.cssText
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    return css + b'\n'
# The characters with ordinals 0-127
ASCII_CHARS = set(map(chr, xrange(128)))
# The characters with ordinals 0-255
UNIBYTE_CHARS = set(map(chr, xrange(256)))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -960,23 +978,7 @@ class Manifest(object):
self._data = None
def __str__(self):
    '''
    Return this item's data serialized to a byte string.

    All the work is delegated to the module level ``serialize()`` function,
    which is given this item's media type and the ``pretty_print`` setting
    of the book this item belongs to. Keeping a second, inline copy of that
    logic here would only let the two drift apart.
    '''
    return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)
def __unicode__(self):
data = self.data

View File

@ -13,17 +13,20 @@ from urllib import unquote as urlunquote
from lxml import etree
from calibre import guess_type, CurrentDir
from calibre.customize.ui import (plugin_for_input_format,
plugin_for_output_format)
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.plugins.epub_input import (
ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font)
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.oeb.base import OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS
from calibre.ebooks.mobi.tweak import set_cover
from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
OEB_STYLES, OPF2_NS)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile
@ -43,6 +46,7 @@ class Container(object):
self.parsed_cache = {}
self.mime_map = {}
self.name_path_map = {}
self.dirtied = set()
# Map of relative paths with '/' separators from root of unzipped ePub
# to absolute paths on filesystem with os-specific separators
@ -141,8 +145,6 @@ class Container(object):
data = self.parse_xml(data)
elif mime in OEB_STYLES:
data = self.parse_css(data, self.relpath(path))
elif mime in OEB_FONTS or path.rpartition('.')[-1].lower() in {'ttf', 'otf'}:
data = Sfnt(data)
return data
def parse_css(self, data, fname):
@ -189,6 +191,64 @@ class Container(object):
for path in non_linear:
yield path
def remove_item(self, name):
    '''
    Remove the item identified by name from this container. This removes
    all references to the item in the OPF manifest, spine and guide, deletes
    the item's file if it exists and drops the item from the internal caches.
    '''
    def opf_xpath(expr):
        return self.opf.xpath(expr, namespaces={'opf':OPF2_NS})

    # Manifest: drop every entry pointing at name, remembering the ids so
    # that the spine can be cleaned up afterwards
    stale_ids = set()
    for entry in opf_xpath('//opf:manifest/opf:item[@href]'):
        if self.href_to_name(entry.get('href')) != name:
            continue
        entry_id = entry.get('id', None)
        if entry_id is not None:
            stale_ids.add(entry_id)
        entry.getparent().remove(entry)

    # Spine: itemrefs refer to manifest entries by id
    if stale_ids:
        for ref in opf_xpath('//opf:spine/opf:itemref[@idref]'):
            if ref.get('idref') in stale_ids:
                ref.getparent().remove(ref)

    # Guide: references point at the item by href, not by id
    for ref in opf_xpath('//opf:guide/opf:reference[@href]'):
        if self.href_to_name(ref.get('href')) == name:
            ref.getparent().remove(ref)

    # Finally forget the file itself
    fs_path = self.name_path_map.pop(name)
    if os.path.exists(fs_path):
        os.remove(fs_path)
    for cache in (self.mime_map, self.parsed_cache):
        cache.pop(name, None)
    self.dirtied.discard(name)
def dirty(self, name):
    '''
    Add ``name`` to the set of dirtied items. Adding a name that is already
    in the set has no further effect.
    '''
    self.dirtied.update((name,))
def commit(self, outpath=None):
    '''
    Write every dirtied item back to its file on disk.

    Each dirtied name is removed from the dirtied set, its parsed object is
    taken out of the parsed cache, serialized and written to the file that
    the name maps to in ``name_path_map``.

    :param outpath: Not used by this implementation.
    :raises KeyError: If a dirtied name is missing from the parsed cache,
        ``mime_map`` or ``name_path_map``.
    '''
    # Iterate over a snapshot: removing entries from a set while iterating
    # over that same set raises RuntimeError
    for name in tuple(self.dirtied):
        self.dirtied.remove(name)
        data = self.parsed_cache.pop(name)
        # serialize() requires the media type of the item as its second
        # argument
        raw = serialize(data, self.mime_map[name])
        with open(self.name_path_map[name], 'wb') as f:
            f.write(raw)
def compare_to(self, other):
    '''
    Compare the files of this container with those of ``other``, byte for
    byte.

    :param other: Another container; only its ``name_path_map`` is used.
    :return: A string describing the differences, one per line, ordered by
        item name. The string is empty when both containers hold the same
        names with identical file contents.
    '''
    if set(self.name_path_map) != set(other.name_path_map):
        # A string, like the return value below, so that callers always
        # get the same type back
        return 'Set of files is not the same'
    import subprocess
    mismatches = []
    # Sorted, so that the report does not depend on dictionary ordering
    for name in sorted(self.name_path_map):
        path, opath = self.name_path_map[name], other.name_path_map[name]
        with open(path, 'rb') as f1, open(opath, 'rb') as f2:
            same = f1.read() == f2.read()
        if same:
            continue
        mismatches.append('The file %s is not the same'%name)
        try:
            # Debugging aid: show the two files in kompare
            subprocess.call(['kompare', path, opath])
        except OSError:
            # kompare is not available, the textual report is still
            # returned
            pass
    return '\n'.join(mismatches)
# EPUB {{{
class InvalidEpub(InvalidBook):
    '''
    An InvalidBook subclass that adds no behavior of its own. Going by its
    name, it is meant for problems specific to EPUB files.
    '''
@ -294,8 +354,24 @@ class EpubContainer(Container):
if not tkey:
raise InvalidBook('Failed to find obfuscation key')
decrypt_font(tkey, path, alg)
self.obfuscated_fonts[name] = (alg, tkey)
self.obfuscated_fonts[font] = (alg, tkey)
def commit(self, outpath=None):
    '''
    Write out all dirtied items and rebuild the EPUB file.

    The fonts recorded in ``obfuscated_fonts`` are obfuscated again on disk
    before the archive is built. Once the archive has been built (or
    building it has failed) they are returned to their de-obfuscated state,
    so that the files on disk stay usable and a later commit() does not
    write out de-obfuscated fonts.

    :param outpath: Path of the EPUB file to create. Defaults to
        ``self.pathtoepub``.
    '''
    super(EpubContainer, self).commit()
    if outpath is None:
        outpath = self.pathtoepub
    from calibre.ebooks.tweak import zip_rebuilder
    toggled = []
    try:
        for name, (alg, key) in self.obfuscated_fonts.items():
            if name not in self.name_path_map:
                continue
            path = self.name_path_map[name]
            # Decrypting and encrypting are the same operation (XOR with key)
            decrypt_font(key, path, alg)
            toggled.append((key, path, alg))
        zip_rebuilder(self.root, outpath)
    finally:
        # Applying the same operation a second time undoes the obfuscation
        for key, path, alg in toggled:
            decrypt_font(key, path, alg)
# }}}
# AZW3 {{{
class InvalidMobi(InvalidBook):
    '''
    An InvalidBook subclass that adds no behavior of its own. Going by its
    name, it is meant for problems specific to MOBI files.
    '''
@ -357,14 +433,40 @@ class AZW3Container(Container):
super(AZW3Container, self).__init__(tdir, opf_path, log)
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
def commit(self, outpath=None):
    '''
    Write out all dirtied items, then build an AZW3 file from the OPF of
    this container using the azw3 output plugin, with the
    ``mobi_passthrough`` option turned on.

    :param outpath: Path of the AZW3 file to create. Defaults to
        ``self.pathtoazw3``.
    '''
    super(AZW3Container, self).commit()
    target = self.pathtoazw3 if outpath is None else outpath

    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    opf_path = self.name_path_map[self.opf_name]

    # The plumber is only used to obtain a populated set of options
    plumber = Plumber(opf_path, target, self.log)
    plumber.setup_options()
    input_plugin = plugin_for_input_format('azw3')
    output_plugin = plugin_for_output_format('azw3')
    opts = plumber.opts
    opts.mobi_passthrough = True

    book = create_oebbook(default_log, opf_path, opts)
    set_cover(book)
    output_plugin.convert(book, target, input_plugin, opts, default_log)
# }}}
def get_container(path, log=None):
    '''
    Create the container object for the book at ``path``.

    :param path: Path to the book. Files whose extension is azw3 or mobi (in
        any case) get an AZW3Container, everything else an EpubContainer.
    :param log: The log passed to the container. Defaults to ``default_log``.
    :return: The newly created container.
    '''
    log = default_log if log is None else log
    extension = path.rpartition('.')[-1].lower()
    if extension in {'azw3', 'mobi'}:
        container_class = AZW3Container
    else:
        container_class = EpubContainer
    return container_class(path, log)
def test_roundtrip():
    '''
    Round trip the book named by the last command line argument.

    The spine items of the book are printed, the book is committed to a
    temporary file with the same extension, and two containers opened from
    that temporary file are compared. Any differences found are printed.
    '''
    ebook = get_container(sys.argv[-1])
    for s in ebook.spine_items:
        print (ebook.relpath(s))
    p = PersistentTemporaryFile(suffix='.'+sys.argv[-1].rpartition('.')[-1])
    p.close()
    ebook.commit(outpath=p.name)
    # NOTE(review): both containers are opened from the same committed file,
    # so this compares two openings of the output rather than the output
    # against the original book - confirm that this is intended
    ebook2 = get_container(p.name)
    ebook3 = get_container(p.name)
    diff = ebook3.compare_to(ebook2)
    # compare_to() never returns None, so test for an empty result instead
    if diff:
        print (diff)


if __name__ == '__main__':
    test_roundtrip()