diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index e6e499236d..bd6a23e871 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -337,6 +337,24 @@ def xml2unicode(root, pretty_print=False): def xml2text(elem): return etree.tostring(elem, method='text', encoding=unicode, with_tail=False) +def serialize(data, media_type, pretty_print=False): + if isinstance(data, etree._Element): + ans = xml2str(data, pretty_print=pretty_print) + if media_type in OEB_DOCS: + # Convert self closing div|span|a|video|audio|iframe|etc tags + # to normally closed ones, as they are interpreted + # incorrectly by some browser based renderers + ans = close_self_closing_tags(ans) + return ans + if isinstance(data, unicode): + return data.encode('utf-8') + if hasattr(data, 'cssText'): + data = data.cssText + if isinstance(data, unicode): + data = data.encode('utf-8') + return data + b'\n' + return bytes(data) + ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -960,23 +978,7 @@ class Manifest(object): self._data = None def __str__(self): - data = self.data - if isinstance(data, etree._Element): - ans = xml2str(data, pretty_print=self.oeb.pretty_print) - if self.media_type in OEB_DOCS: - # Convert self closing div|span|a|video|audio|iframe|etc tags - # to normally closed ones, as they are interpreted - # incorrectly by some browser based renderers - ans = close_self_closing_tags(ans) - return ans - if isinstance(data, unicode): - return data.encode('utf-8') - if hasattr(data, 'cssText'): - data = data.cssText - if isinstance(data, unicode): - data = data.encode('utf-8') - return data + b'\n' - return str(data) + return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print) def __unicode__(self): data = self.data diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index 2ddf6223ac..01b2f62ab2 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ -13,17 +13,20 @@ from urllib import unquote as urlunquote from lxml import etree from calibre import guess_type, CurrentDir +from calibre.customize.ui import (plugin_for_input_format, + plugin_for_output_format) from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.conversion.plugins.epub_input import ( ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font) from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.reader.headers import MetadataHeader -from calibre.ebooks.oeb.base import OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS +from calibre.ebooks.mobi.tweak import set_cover +from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger, + OEB_STYLES, OPF2_NS) from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER -from calibre.ptempfile import PersistentTemporaryDirectory -from calibre.utils.fonts.sfnt.container import Sfnt +from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.ipc.simple_worker import fork_job, WorkerError from calibre.utils.logging import default_log from calibre.utils.zipfile import ZipFile @@ -43,6 +46,7 @@ class Container(object): self.parsed_cache = {} self.mime_map = {} self.name_path_map = {} + self.dirtied = set() # Map of relative paths with '/' separators from root of unzipped ePub # to absolute paths on filesystem with os-specific separators @@ -141,8 +145,6 @@ class Container(object): data = self.parse_xml(data) elif mime in OEB_STYLES: data = self.parse_css(data, self.relpath(path)) - elif mime in OEB_FONTS or path.rpartition('.')[-1].lower() in {'ttf', 'otf'}: - data = Sfnt(data) return data def parse_css(self, data, fname): @@ -189,6 +191,64 @@ class Container(object): for path in non_linear: yield path + def remove_item(self, name): + ''' + Remove the item identified by name from this container. This remove all + references to the item in the OPF manifest, guide and spine as well as from + any internal caches. + ''' + removed = set() + for elem in self.opf.xpath('//opf:manifest/opf:item[@href]', + namespaces={'opf':OPF2_NS}): + if self.href_to_name(elem.get('href')) == name: + id_ = elem.get('id', None) + if id_ is not None: + removed.add(id_) + elem.getparent().remove(elem) + if removed: + for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]', + namespaces={'opf':OPF2_NS}): + idref = item.get('idref') + if idref in removed: + item.getparent().remove(item) + + for item in self.opf.xpath('//opf:guide/opf:reference[@href]', + namespaces={'opf':OPF2_NS}): + if self.href_to_name(item.get('href')) == name: + item.getparent().remove(item) + + path = self.name_path_map.pop(name) + if os.path.exists(path): + os.remove(path) + self.mime_map.pop(name, None) + self.parsed_cache.pop(name, None) + self.dirtied.discard(name) + + def dirty(self, name): + self.dirtied.add(name) + + def commit(self, outpath=None): + for name in self.dirtied: + self.dirtied.remove(name) + data = self.parsed_cache.pop(name) + data = serialize(data) + with open(self.name_path_map[name], 'wb') as f: + f.write(data) + + def compare_to(self, other): + if set(self.name_path_map) != set(other.name_path_map): + return ['Set of files is not the same'] + mismatches = [] + for name, path in self.name_path_map.iteritems(): + opath = other.name_path_map[name] + with open(path, 'rb') as f1, open(opath, 'rb') as f2: + if f1.read() != f2.read(): + mismatches.append('The file %s is not the same'%name) + import subprocess + subprocess.call(['kompare', path, opath]) + return '\n'.join(mismatches) + +# EPUB {{{ class InvalidEpub(InvalidBook): pass @@ -294,8 +354,24 @@ class EpubContainer(Container): if not tkey: raise InvalidBook('Failed to find obfuscation key') decrypt_font(tkey, path, alg) - self.obfuscated_fonts[name] = (alg, tkey) + self.obfuscated_fonts[font] = (alg, tkey) + def commit(self, outpath=None): + super(EpubContainer, self).commit() + for name in self.obfuscated_fonts: + if name not in self.name_path_map: + continue + alg, key = self.obfuscated_fonts[name] + # Decrypting and encrypting are the same operation (XOR with key) + decrypt_font(key, self.name_path_map[name], alg) + if outpath is None: + outpath = self.pathtoepub + from calibre.ebooks.tweak import zip_rebuilder + zip_rebuilder(self.root, outpath) + +# }}} + +# AZW3 {{{ class InvalidMobi(InvalidBook): pass @@ -357,14 +433,40 @@ class AZW3Container(Container): super(AZW3Container, self).__init__(tdir, opf_path, log) self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts} + def commit(self, outpath=None): + super(AZW3Container, self).commit() + if outpath is None: + outpath = self.pathtoazw3 + from calibre.ebooks.conversion.plumber import Plumber, create_oebbook + opf = self.name_path_map[self.opf_name] + plumber = Plumber(opf, outpath, self.log) + plumber.setup_options() + inp = plugin_for_input_format('azw3') + outp = plugin_for_output_format('azw3') + plumber.opts.mobi_passthrough = True + oeb = create_oebbook(default_log, opf, plumber.opts) + set_cover(oeb) + outp.convert(oeb, outpath, inp, plumber.opts, default_log) +# }}} + def get_container(path, log=None): if log is None: log = default_log ebook = (AZW3Container if path.rpartition('.')[-1].lower() in {'azw3', 'mobi'} else EpubContainer)(path, log) return ebook -if __name__ == '__main__': +def test_roundtrip(): ebook = get_container(sys.argv[-1]) - for s in ebook.spine_items: - print (ebook.relpath(s)) + p = PersistentTemporaryFile(suffix='.'+sys.argv[-1].rpartition('.')[-1]) + p.close() + ebook.commit(outpath=p.name) + ebook2 = get_container(p.name) + ebook3 = get_container(p.name) + diff = ebook3.compare_to(ebook2) + if diff is not None: + print (diff) + +if __name__ == '__main__': + test_roundtrip() +