ebook-polish: Roundtripping for both epub and azw3.

2025-07-09 03:04:10 -04:00 · 2013-02-03 22:36:31 +05:30 · 2013-02-03 22:36:31 +05:30 · 09429e3c1c
commit 09429e3c1c
parent d661b15ae2
2 changed files with 130 additions and 26 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -337,6 +337,24 @@ def xml2unicode(root, pretty_print=False):
 def xml2text(elem):
    '''
    Return the text content of elem as a unicode string, using lxml's
    'text' serialization method. The tail text that follows elem is not
    included.
    '''
    text = etree.tostring(elem, method='text', encoding=unicode,
                          with_tail=False)
    return text
 def serialize(data, media_type, pretty_print=False):
    '''
    Convert the in-memory form of a book item into the form written to disk.

    data may be an lxml element, a unicode string, an object exposing a
    cssText attribute, or anything else that bytes() accepts. media_type is
    only consulted for lxml elements, to decide whether self closing tags
    must be expanded. pretty_print is passed through to xml2str().
    '''
    if isinstance(data, etree._Element):
        raw = xml2str(data, pretty_print=pretty_print)
        # Some browser based renderers misinterpret self closing
        # div|span|a|video|audio|iframe|etc tags, so in OEB documents they
        # are rewritten as normally closed tags
        return close_self_closing_tags(raw) if media_type in OEB_DOCS else raw
    if isinstance(data, unicode):
        return data.encode('utf-8')
    if not hasattr(data, 'cssText'):
        return bytes(data)
    css = data.cssText
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    # Stylesheets always end with a newline
    return css + b'\n'
 ASCII_CHARS   = set(chr(x) for x in xrange(128))
 UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
 URL_SAFE      = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -960,23 +978,7 @@ class Manifest(object):
                self._data = None
        def __str__(self):
-            data = self.data
+            return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)
            if isinstance(data, etree._Element):
                ans = xml2str(data, pretty_print=self.oeb.pretty_print)
                if self.media_type in OEB_DOCS:
                    # Convert self closing div|span|a|video|audio|iframe|etc tags
                    # to normally closed ones, as they are interpreted
                    # incorrectly by some browser based renderers
                    ans = close_self_closing_tags(ans)
                return ans
            if isinstance(data, unicode):
                return data.encode('utf-8')
            if hasattr(data, 'cssText'):
                data = data.cssText
                if isinstance(data, unicode):
                    data = data.encode('utf-8')
                return data + b'\n'
            return str(data)
        def __unicode__(self):
            data = self.data
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -13,17 +13,20 @@ from urllib import unquote as urlunquote
 from lxml import etree
 from calibre import guess_type, CurrentDir
 from calibre.customize.ui import (plugin_for_input_format,
        plugin_for_output_format)
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.conversion.plugins.epub_input import (
    ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font)
 from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.reader.headers import MetadataHeader
-from calibre.ebooks.oeb.base import OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS
+from calibre.ebooks.mobi.tweak import set_cover
 from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
                                     OEB_STYLES, OPF2_NS)
 from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
-from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
 from calibre.utils.fonts.sfnt.container import Sfnt
 from calibre.utils.ipc.simple_worker  import fork_job, WorkerError
 from calibre.utils.logging import default_log
 from calibre.utils.zipfile import ZipFile
@ -43,6 +46,7 @@ class Container(object):
        self.parsed_cache = {}
        self.mime_map = {}
        self.name_path_map = {}
        self.dirtied = set()
        # Map of relative paths with '/' separators from root of unzipped ePub
        # to absolute paths on filesystem with os-specific separators
@ -141,8 +145,6 @@ class Container(object):
            data = self.parse_xml(data)
        elif mime in OEB_STYLES:
            data = self.parse_css(data, self.relpath(path))
        elif mime in OEB_FONTS or path.rpartition('.')[-1].lower() in {'ttf', 'otf'}:
            data = Sfnt(data)
        return data
    def parse_css(self, data, fname):
@ -189,6 +191,64 @@ class Container(object):
        for path in non_linear:
            yield path
    def remove_item(self, name):
        '''
        Remove the item identified by name from this container. This removes
        the manifest entry for the item from the OPF, every spine itemref
        that points at that manifest entry and every guide reference to the
        item. The file is then deleted from disk (if it exists) and the name
        is dropped from the internal caches.

        Raises KeyError if name is not present in name_path_map.
        '''
        ns = {'opf':OPF2_NS}
        removed_ids = set()
        for entry in self.opf.xpath('//opf:manifest/opf:item[@href]',
                                    namespaces=ns):
            if self.href_to_name(entry.get('href')) != name:
                continue
            item_id = entry.get('id', None)
            if item_id is not None:
                removed_ids.add(item_id)
            entry.getparent().remove(entry)

        if removed_ids:
            # Spine entries refer to manifest items by id, not by href
            for itemref in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
                                          namespaces=ns):
                if itemref.get('idref') in removed_ids:
                    itemref.getparent().remove(itemref)

        for reference in self.opf.xpath('//opf:guide/opf:reference[@href]',
                                        namespaces=ns):
            if self.href_to_name(reference.get('href')) == name:
                reference.getparent().remove(reference)

        path = self.name_path_map.pop(name)
        if os.path.exists(path):
            os.remove(path)
        for cache in (self.mime_map, self.parsed_cache):
            cache.pop(name, None)
        self.dirtied.discard(name)
    def dirty(self, name):
        '''
        Record name in the set of dirtied items. commit() walks that set and
        writes the parsed form of each entry back to its file.
        '''
        self.dirtied.add(name)
    def commit(self, outpath=None):
        '''
        Write every item that has been marked dirty back to its file.

        Each dirty name is removed from the dirtied set and from the parsed
        cache, serialized according to its entry in mime_map and written, in
        binary mode, to the path recorded in name_path_map.

        outpath is not used by this implementation. The EPUB and AZW3
        subclasses accept the same parameter and use it as the destination
        of the rebuilt book.

        Raises KeyError if a dirty name is missing from parsed_cache,
        mime_map or name_path_map.
        '''
        # Iterate over a snapshot: removing entries from a set while
        # iterating over that same set raises RuntimeError
        for name in tuple(self.dirtied):
            self.dirtied.remove(name)
            data = self.parsed_cache.pop(name)
            # serialize() needs the media type to decide how XML documents
            # are written out
            data = serialize(data, self.mime_map[name])
            with open(self.name_path_map[name], 'wb') as f:
                f.write(data)
    def compare_to(self, other):
        '''
        Compare the files in this container with the files in other.

        Returns a string. It is empty when both containers hold the same
        set of names and every pair of files is byte for byte identical.
        Otherwise it contains one line per problem found: either a single
        line saying that the sets of files differ, or one line for every
        file whose contents differ, ordered by name.

        For every file that differs, the external kompare program is
        started on the two paths as a debugging aid.
        '''
        import subprocess
        if set(self.name_path_map) != set(other.name_path_map):
            # A plain string, like the value returned below, so that callers
            # can always treat the result the same way
            return 'Set of files is not the same'
        mismatches = []
        # Sorted so that the report does not depend on dict ordering
        for name in sorted(self.name_path_map):
            path = self.name_path_map[name]
            opath = other.name_path_map[name]
            with open(path, 'rb') as f1, open(opath, 'rb') as f2:
                identical = f1.read() == f2.read()
            if identical:
                continue
            mismatches.append('The file %s is not the same'%name)
            try:
                subprocess.call(['kompare', path, opath])
            except OSError:
                # The viewer is only a convenience; when it cannot be
                # started the mismatch is still reported in the result
                pass
        return '\n'.join(mismatches)
 # EPUB {{{
 class InvalidEpub(InvalidBook):
    pass
@ -294,8 +354,24 @@ class EpubContainer(Container):
            if not tkey:
                raise InvalidBook('Failed to find obfuscation key')
            decrypt_font(tkey, path, alg)
-            self.obfuscated_fonts[name] = (alg, tkey)
+            self.obfuscated_fonts[font] = (alg, tkey)
    def commit(self, outpath=None):
        '''
        Write out all pending changes and rebuild the EPUB file.

        After the base class has written the dirty items, every font listed
        in obfuscated_fonts that is still present in the container is run
        through decrypt_font() again, and the container root is then zipped
        up. The result is written to outpath, or to pathtoepub when outpath
        is None.
        '''
        super(EpubContainer, self).commit()
        for font_name in self.obfuscated_fonts:
            if font_name in self.name_path_map:
                algorithm, key = self.obfuscated_fonts[font_name]
                # Obfuscation is an XOR with the key, so applying the
                # decryption routine a second time encrypts the font again
                decrypt_font(key, self.name_path_map[font_name], algorithm)
        destination = self.pathtoepub if outpath is None else outpath
        from calibre.ebooks.tweak import zip_rebuilder
        zip_rebuilder(self.root, destination)
 # }}}
 # AZW3 {{{
 class InvalidMobi(InvalidBook):
    pass
@ -357,14 +433,40 @@ class AZW3Container(Container):
        super(AZW3Container, self).__init__(tdir, opf_path, log)
        self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
    def commit(self, outpath=None):
        '''
        Write out all pending changes and regenerate the AZW3 file.

        After the base class has written the dirty items, the OPF of this
        container is loaded as an OEB book, set_cover() is applied to it and
        the azw3 output plugin converts it, with the mobi_passthrough option
        enabled. The result is written to outpath, or to pathtoazw3 when
        outpath is None.
        '''
        super(AZW3Container, self).commit()
        outpath = self.pathtoazw3 if outpath is None else outpath
        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
        opf_path = self.name_path_map[self.opf_name]
        plumber = Plumber(opf_path, outpath, self.log)
        plumber.setup_options()
        input_plugin = plugin_for_input_format('azw3')
        output_plugin = plugin_for_output_format('azw3')
        plumber.opts.mobi_passthrough = True
        book = create_oebbook(default_log, opf_path, plumber.opts)
        set_cover(book)
        output_plugin.convert(book, outpath, input_plugin, plumber.opts,
                              default_log)
 # }}}
 def get_container(path, log=None):
    '''
    Open the book at path and return a container object for it.

    Paths whose extension is azw3 or mobi (compared case insensitively)
    are opened with AZW3Container; every other path is opened with
    EpubContainer. When log is None, default_log is used.
    '''
    if log is None:
        log = default_log
    extension = path.rpartition('.')[-1].lower()
    container_class = (AZW3Container if extension in {'azw3', 'mobi'}
                       else EpubContainer)
    return container_class(path, log)
-if __name__ == '__main__':
+def test_roundtrip():
    ebook = get_container(sys.argv[-1])
-    for s in ebook.spine_items:
+    p = PersistentTemporaryFile(suffix='.'+sys.argv[-1].rpartition('.')[-1])
-        print (ebook.relpath(s))
+    p.close()
    ebook.commit(outpath=p.name)
    ebook2 = get_container(p.name)
    ebook3 = get_container(p.name)
    diff = ebook3.compare_to(ebook2)
    if diff is not None:
        print (diff)
 if __name__ == '__main__':
    test_roundtrip()