ebook-polish: Roundtripping for both epub and azw3.

This commit is contained in:
Kovid Goyal 2013-02-03 22:36:31 +05:30
parent d661b15ae2
commit 09429e3c1c
2 changed files with 130 additions and 26 deletions

View File

@ -337,6 +337,24 @@ def xml2unicode(root, pretty_print=False):
def xml2text(elem): def xml2text(elem):
return etree.tostring(elem, method='text', encoding=unicode, with_tail=False) return etree.tostring(elem, method='text', encoding=unicode, with_tail=False)
def serialize(data, media_type, pretty_print=False):
    '''
    Convert the parsed representation of a book item into a bytestring.

    :param data: An lxml element, a unicode string, an object with a
                 ``cssText`` attribute, or anything accepted by ``bytes()``.
    :param media_type: The MIME type of the item. Only consulted for lxml
                       elements, to decide whether self closing tags need
                       to be expanded.
    :param pretty_print: Passed through to xml2str() for lxml elements.
    '''
    if isinstance(data, etree._Element):
        raw = xml2str(data, pretty_print=pretty_print)
        if media_type not in OEB_DOCS:
            return raw
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(raw)

    if isinstance(data, unicode):
        return data.encode('utf-8')

    if not hasattr(data, 'cssText'):
        return bytes(data)

    # Stylesheet-like object: serialize its text, newline terminated
    css = data.cssText
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    return css + b'\n'
ASCII_CHARS = set(chr(x) for x in xrange(128)) ASCII_CHARS = set(chr(x) for x in xrange(128))
UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -960,23 +978,7 @@ class Manifest(object):
self._data = None self._data = None
def __str__(self): def __str__(self):
data = self.data return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)
if isinstance(data, etree._Element):
ans = xml2str(data, pretty_print=self.oeb.pretty_print)
if self.media_type in OEB_DOCS:
# Convert self closing div|span|a|video|audio|iframe|etc tags
# to normally closed ones, as they are interpreted
# incorrectly by some browser based renderers
ans = close_self_closing_tags(ans)
return ans
if isinstance(data, unicode):
return data.encode('utf-8')
if hasattr(data, 'cssText'):
data = data.cssText
if isinstance(data, unicode):
data = data.encode('utf-8')
return data + b'\n'
return str(data)
def __unicode__(self): def __unicode__(self):
data = self.data data = self.data

View File

@ -13,17 +13,20 @@ from urllib import unquote as urlunquote
from lxml import etree from lxml import etree
from calibre import guess_type, CurrentDir from calibre import guess_type, CurrentDir
from calibre.customize.ui import (plugin_for_input_format,
plugin_for_output_format)
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.plugins.epub_input import ( from calibre.ebooks.conversion.plugins.epub_input import (
ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font) ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font)
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.oeb.base import OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS from calibre.ebooks.mobi.tweak import set_cover
from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
OEB_STYLES, OPF2_NS)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.utils.ipc.simple_worker import fork_job, WorkerError from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -43,6 +46,7 @@ class Container(object):
self.parsed_cache = {} self.parsed_cache = {}
self.mime_map = {} self.mime_map = {}
self.name_path_map = {} self.name_path_map = {}
self.dirtied = set()
# Map of relative paths with '/' separators from root of unzipped ePub # Map of relative paths with '/' separators from root of unzipped ePub
# to absolute paths on filesystem with os-specific separators # to absolute paths on filesystem with os-specific separators
@ -141,8 +145,6 @@ class Container(object):
data = self.parse_xml(data) data = self.parse_xml(data)
elif mime in OEB_STYLES: elif mime in OEB_STYLES:
data = self.parse_css(data, self.relpath(path)) data = self.parse_css(data, self.relpath(path))
elif mime in OEB_FONTS or path.rpartition('.')[-1].lower() in {'ttf', 'otf'}:
data = Sfnt(data)
return data return data
def parse_css(self, data, fname): def parse_css(self, data, fname):
@ -189,6 +191,64 @@ class Container(object):
for path in non_linear: for path in non_linear:
yield path yield path
def remove_item(self, name):
    '''
    Remove the item identified by name from this container. This removes all
    references to the item in the OPF manifest, guide and spine as well as from
    any internal caches.

    If name has no entry in name_path_map, only the OPF references and the
    caches are cleaned up; no file is deleted and no error is raised.
    '''
    opf_ns = {'opf': OPF2_NS}

    # Drop matching manifest entries, remembering their ids so that the
    # spine entries referring to them can be dropped as well
    removed = set()
    for elem in self.opf.xpath('//opf:manifest/opf:item[@href]',
                               namespaces=opf_ns):
        if self.href_to_name(elem.get('href')) == name:
            id_ = elem.get('id', None)
            if id_ is not None:
                removed.add(id_)
            elem.getparent().remove(elem)

    if removed:
        for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
                                   namespaces=opf_ns):
            if item.get('idref') in removed:
                item.getparent().remove(item)

    for item in self.opf.xpath('//opf:guide/opf:reference[@href]',
                               namespaces=opf_ns):
        if self.href_to_name(item.get('href')) == name:
            item.getparent().remove(item)

    # Use a default here: the OPF has already been modified above, so
    # raising KeyError at this point would leave the container with the
    # OPF references gone but the caches still populated.
    path = self.name_path_map.pop(name, None)
    if path is not None and os.path.exists(path):
        os.remove(path)
    self.mime_map.pop(name, None)
    self.parsed_cache.pop(name, None)
    self.dirtied.discard(name)
def dirty(self, name):
    '''
    Mark the item identified by name as modified, by adding it to the
    set of dirtied names.
    '''
    self.dirtied.add(name)
def commit(self, outpath=None):
    '''
    Serialize every dirtied item from the parsed cache back to its file
    on disk. Each committed item is removed from both the dirtied set and
    the parsed cache.

    :param outpath: Not used by this implementation.
    '''
    # Iterate over a snapshot, as removing entries from a set while
    # iterating over it raises RuntimeError
    for name in tuple(self.dirtied):
        self.dirtied.remove(name)
        data = self.parsed_cache.pop(name)
        # serialize() needs the media type to decide how to write markup
        data = serialize(data, self.mime_map[name])
        with open(self.name_path_map[name], 'wb') as f:
            f.write(data)
def compare_to(self, other):
    '''
    Compare the files of this container with those of other, byte for byte.

    :param other: An object with a name_path_map attribute, mapping names
                  to filesystem paths.
    :return: A string describing the differences, one per line. The string
             is empty when both containers have the same set of names and
             all the corresponding files have identical contents.
    '''
    if set(self.name_path_map) != set(other.name_path_map):
        # Always return a string, so callers can treat the result uniformly
        return 'Set of files is not the same'
    mismatches = []
    for name, path in self.name_path_map.items():
        opath = other.name_path_map[name]
        with open(path, 'rb') as f1, open(opath, 'rb') as f2:
            if f1.read() != f2.read():
                mismatches.append('The file %s is not the same'%name)
    return '\n'.join(mismatches)
# EPUB {{{
class InvalidEpub(InvalidBook): class InvalidEpub(InvalidBook):
pass pass
@ -294,8 +354,24 @@ class EpubContainer(Container):
if not tkey: if not tkey:
raise InvalidBook('Failed to find obfuscation key') raise InvalidBook('Failed to find obfuscation key')
decrypt_font(tkey, path, alg) decrypt_font(tkey, path, alg)
self.obfuscated_fonts[name] = (alg, tkey) self.obfuscated_fonts[font] = (alg, tkey)
def commit(self, outpath=None):
    '''
    Commit all dirtied items and build an EPUB file from the container
    root directory.

    Fonts recorded in obfuscated_fonts are re-obfuscated only for the
    duration of the build; the working copies are restored afterwards so
    that the container can be used and committed again.

    :param outpath: Path of the EPUB to create. Defaults to pathtoepub.
    '''
    super(EpubContainer, self).commit()
    if outpath is None:
        outpath = self.pathtoepub
    from calibre.ebooks.tweak import zip_rebuilder

    toggled = []
    try:
        for name in self.obfuscated_fonts:
            if name not in self.name_path_map:
                continue
            alg, key = self.obfuscated_fonts[name]
            path = self.name_path_map[name]
            # Decrypting and encrypting are the same operation (XOR with key)
            decrypt_font(key, path, alg)
            toggled.append((key, path, alg))
        zip_rebuilder(self.root, outpath)
    finally:
        # Undo the obfuscation of the working copies. As the operation is
        # a toggle, leaving them obfuscated would make the next commit
        # write de-obfuscated fonts into the book.
        for key, path, alg in toggled:
            decrypt_font(key, path, alg)
# }}}
# AZW3 {{{
class InvalidMobi(InvalidBook): class InvalidMobi(InvalidBook):
pass pass
@ -357,14 +433,40 @@ class AZW3Container(Container):
super(AZW3Container, self).__init__(tdir, opf_path, log) super(AZW3Container, self).__init__(tdir, opf_path, log)
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts} self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
def commit(self, outpath=None):
    '''
    Commit all dirtied items and build an AZW3 file from the OPF of this
    container, using the azw3 output plugin.

    :param outpath: Path of the AZW3 file to create. Defaults to
                    pathtoazw3.
    '''
    super(AZW3Container, self).commit()
    if outpath is None:
        outpath = self.pathtoazw3
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    opf = self.name_path_map[self.opf_name]
    plumber = Plumber(opf, outpath, self.log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')
    plumber.opts.mobi_passthrough = True
    # Use the log of this container throughout, rather than the global
    # default log, so that a log supplied by the caller receives all output
    oeb = create_oebbook(self.log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, outpath, inp, plumber.opts, self.log)
# }}}
def get_container(path, log=None): def get_container(path, log=None):
if log is None: log = default_log if log is None: log = default_log
ebook = (AZW3Container if path.rpartition('.')[-1].lower() in {'azw3', 'mobi'} ebook = (AZW3Container if path.rpartition('.')[-1].lower() in {'azw3', 'mobi'}
else EpubContainer)(path, log) else EpubContainer)(path, log)
return ebook return ebook
if __name__ == '__main__': def test_roundtrip():
ebook = get_container(sys.argv[-1]) ebook = get_container(sys.argv[-1])
for s in ebook.spine_items: p = PersistentTemporaryFile(suffix='.'+sys.argv[-1].rpartition('.')[-1])
print (ebook.relpath(s)) p.close()
ebook.commit(outpath=p.name)
ebook2 = get_container(p.name)
ebook3 = get_container(p.name)
diff = ebook3.compare_to(ebook2)
if diff is not None:
print (diff)
if __name__ == '__main__':
test_roundtrip()