mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
ebook-polish: Roundtripping for both epub and azw3.
This commit is contained in:
parent
d661b15ae2
commit
09429e3c1c
@ -337,6 +337,24 @@ def xml2unicode(root, pretty_print=False):
|
|||||||
def xml2text(elem):
    '''
    Return the text content of elem as a unicode string.

    The element is serialized with lxml's ``text`` output method; the
    element's tail text is excluded (``with_tail=False``).
    '''
    text = etree.tostring(elem, encoding=unicode, method='text',
            with_tail=False)
    return text
|
||||||
|
|
||||||
|
def serialize(data, media_type, pretty_print=False):
    '''
    Convert the parsed representation of an item into the raw data that
    should be written to disk.

    :param data: An lxml element, a unicode string, an object exposing a
        ``cssText`` attribute, or anything accepted by ``bytes()``.
    :param media_type: The MIME type of the item. Only consulted for lxml
        elements, to decide whether self closing tags must be expanded.
    :param pretty_print: Passed through to xml2str() for lxml elements.
    :return: The serialized form of data.
    '''
    if isinstance(data, etree._Element):
        raw = xml2str(data, pretty_print=pretty_print)
        if media_type not in OEB_DOCS:
            return raw
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(raw)

    if isinstance(data, unicode):
        return data.encode('utf-8')

    if not hasattr(data, 'cssText'):
        return bytes(data)

    # Stylesheet-like object: serialize its text and terminate it with a
    # newline
    css = data.cssText
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    return css + b'\n'
|
||||||
|
|
||||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||||
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
||||||
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
@ -960,23 +978,7 @@ class Manifest(object):
|
|||||||
self._data = None
|
self._data = None
|
||||||
|
|
||||||
def __str__(self):
    '''
    Return the serialized form of this item's data.

    The work is delegated to the module level serialize() function, using
    this item's media type and the pretty_print setting of the owning book.
    '''
    data = self.data
    return serialize(data, self.media_type,
            pretty_print=self.oeb.pretty_print)
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
data = self.data
|
data = self.data
|
||||||
|
@ -13,17 +13,20 @@ from urllib import unquote as urlunquote
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre import guess_type, CurrentDir
|
from calibre import guess_type, CurrentDir
|
||||||
|
from calibre.customize.ui import (plugin_for_input_format,
|
||||||
|
plugin_for_output_format)
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.conversion.plugins.epub_input import (
|
from calibre.ebooks.conversion.plugins.epub_input import (
|
||||||
ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font)
|
ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font)
|
||||||
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor
|
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor
|
||||||
from calibre.ebooks.mobi import MobiError
|
from calibre.ebooks.mobi import MobiError
|
||||||
from calibre.ebooks.mobi.reader.headers import MetadataHeader
|
from calibre.ebooks.mobi.reader.headers import MetadataHeader
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS
|
from calibre.ebooks.mobi.tweak import set_cover
|
||||||
|
from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
|
||||||
|
OEB_STYLES, OPF2_NS)
|
||||||
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
|
||||||
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
|
||||||
from calibre.utils.fonts.sfnt.container import Sfnt
|
|
||||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
@ -43,6 +46,7 @@ class Container(object):
|
|||||||
self.parsed_cache = {}
|
self.parsed_cache = {}
|
||||||
self.mime_map = {}
|
self.mime_map = {}
|
||||||
self.name_path_map = {}
|
self.name_path_map = {}
|
||||||
|
self.dirtied = set()
|
||||||
|
|
||||||
# Map of relative paths with '/' separators from root of unzipped ePub
|
# Map of relative paths with '/' separators from root of unzipped ePub
|
||||||
# to absolute paths on filesystem with os-specific separators
|
# to absolute paths on filesystem with os-specific separators
|
||||||
@ -141,8 +145,6 @@ class Container(object):
|
|||||||
data = self.parse_xml(data)
|
data = self.parse_xml(data)
|
||||||
elif mime in OEB_STYLES:
|
elif mime in OEB_STYLES:
|
||||||
data = self.parse_css(data, self.relpath(path))
|
data = self.parse_css(data, self.relpath(path))
|
||||||
elif mime in OEB_FONTS or path.rpartition('.')[-1].lower() in {'ttf', 'otf'}:
|
|
||||||
data = Sfnt(data)
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def parse_css(self, data, fname):
|
def parse_css(self, data, fname):
|
||||||
@ -189,6 +191,64 @@ class Container(object):
|
|||||||
for path in non_linear:
|
for path in non_linear:
|
||||||
yield path
|
yield path
|
||||||
|
|
||||||
|
def remove_item(self, name):
    '''
    Remove the item identified by name from this container. This removes
    all references to the item in the OPF manifest, guide and spine as well
    as from any internal caches. The file backing the item is deleted from
    the filesystem if it exists.

    Raises KeyError if name is not present in name_path_map.
    '''
    ns = {'opf':OPF2_NS}

    # Drop matching manifest entries, remembering their ids so that the
    # spine entries pointing at them can be dropped as well
    removed_ids = set()
    for manifest_item in self.opf.xpath('//opf:manifest/opf:item[@href]',
                                        namespaces=ns):
        if self.href_to_name(manifest_item.get('href')) != name:
            continue
        item_id = manifest_item.get('id', None)
        if item_id is not None:
            removed_ids.add(item_id)
        manifest_item.getparent().remove(manifest_item)

    if removed_ids:
        for itemref in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
                                      namespaces=ns):
            if itemref.get('idref') in removed_ids:
                itemref.getparent().remove(itemref)

    # Guide references are matched by href rather than by manifest id
    for reference in self.opf.xpath('//opf:guide/opf:reference[@href]',
                                    namespaces=ns):
        if self.href_to_name(reference.get('href')) == name:
            reference.getparent().remove(reference)

    # Finally delete the file itself and forget everything known about it
    path = self.name_path_map.pop(name)
    if os.path.exists(path):
        os.remove(path)
    for cache in (self.mime_map, self.parsed_cache):
        cache.pop(name, None)
    self.dirtied.discard(name)
|
||||||
|
|
||||||
|
def dirty(self, name):
    '''
    Record that the item identified by name has been modified, by adding
    it to the set of dirtied names. Marking the same name more than once
    has no additional effect.
    '''
    self.dirtied.update((name,))
|
||||||
|
|
||||||
|
def commit(self, outpath=None):
    '''
    Write every dirtied item back to its file on disk.

    Each dirtied name is removed from the dirtied set, its parsed form is
    taken out of parsed_cache, serialized according to its MIME type and
    written to the path recorded in name_path_map.

    :param outpath: Not used by this base implementation; accepted so that
        subclasses which build an output file share the same signature.

    Raises KeyError if a dirtied name has no entry in parsed_cache,
    mime_map or name_path_map.
    '''
    # Iterate over a snapshot: the set is modified inside the loop and
    # mutating a set while iterating over it raises RuntimeError
    for name in tuple(self.dirtied):
        self.dirtied.remove(name)
        data = self.parsed_cache.pop(name)
        # serialize() needs the media type to decide how to write the data
        raw = serialize(data, self.mime_map[name])
        with open(self.name_path_map[name], 'wb') as f:
            f.write(raw)
|
||||||
|
|
||||||
|
def compare_to(self, other):
    '''
    Compare the files in this container with those in another container,
    byte for byte.

    :param other: An object with a name_path_map attribute mapping item
        names to filesystem paths, like this container.
    :return: A string describing the differences. It is
        'Set of files is not the same' when the two containers do not have
        the same set of names, otherwise one line per differing file. The
        string is empty when no differences were found.
    '''
    if set(self.name_path_map) != set(other.name_path_map):
        # Return a string here as well, so that callers always get the
        # same type back
        return 'Set of files is not the same'

    mismatches = []
    for name, path in self.name_path_map.items():
        opath = other.name_path_map[name]
        with open(path, 'rb') as f1, open(opath, 'rb') as f2:
            identical = f1.read() == f2.read()
        if identical:
            continue
        mismatches.append('The file %s is not the same'%name)
        # Developer aid: show the difference in kompare. The viewer is
        # optional, so its absence must not abort the comparison.
        try:
            import subprocess
            subprocess.call(['kompare', path, opath])
        except OSError:
            pass
    return '\n'.join(mismatches)
|
||||||
|
|
||||||
|
# EPUB {{{
|
||||||
class InvalidEpub(InvalidBook):
|
class InvalidEpub(InvalidBook):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -294,8 +354,24 @@ class EpubContainer(Container):
|
|||||||
if not tkey:
|
if not tkey:
|
||||||
raise InvalidBook('Failed to find obfuscation key')
|
raise InvalidBook('Failed to find obfuscation key')
|
||||||
decrypt_font(tkey, path, alg)
|
decrypt_font(tkey, path, alg)
|
||||||
self.obfuscated_fonts[name] = (alg, tkey)
|
self.obfuscated_fonts[font] = (alg, tkey)
|
||||||
|
|
||||||
|
def commit(self, outpath=None):
    '''
    Write all dirtied items to disk and rebuild the EPUB file.

    Fonts that were de-obfuscated when the book was opened are obfuscated
    again before the archive is built, and restored to their de-obfuscated
    form afterwards, so that the working copy stays usable and commit()
    can safely be called more than once.

    :param outpath: Path of the EPUB file to create. Defaults to
        self.pathtoepub.
    '''
    super(EpubContainer, self).commit()
    if outpath is None:
        outpath = self.pathtoepub

    reobfuscated = []
    try:
        for name, (alg, key) in self.obfuscated_fonts.items():
            if name not in self.name_path_map:
                continue
            path = self.name_path_map[name]
            # Decrypting and encrypting are the same operation (XOR with key)
            decrypt_font(key, path, alg)
            reobfuscated.append((key, path, alg))
        from calibre.ebooks.tweak import zip_rebuilder
        zip_rebuilder(self.root, outpath)
    finally:
        # Undo the obfuscation of the working copies. Without this, a
        # second commit would toggle the fonts back and write them out
        # unobfuscated.
        for key, path, alg in reobfuscated:
            decrypt_font(key, path, alg)
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
# AZW3 {{{
|
||||||
class InvalidMobi(InvalidBook):
|
class InvalidMobi(InvalidBook):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -357,14 +433,40 @@ class AZW3Container(Container):
|
|||||||
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
super(AZW3Container, self).__init__(tdir, opf_path, log)
|
||||||
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
|
||||||
|
|
||||||
|
def commit(self, outpath=None):
    '''
    Write all dirtied items to disk and rebuild the AZW3 file.

    The book is rebuilt by loading the OPF of the working copy as an OEB
    book and running it through the azw3 output plugin with the
    mobi_passthrough option enabled.

    :param outpath: Path of the file to create. Defaults to
        self.pathtoazw3.
    '''
    super(AZW3Container, self).commit()
    target = self.pathtoazw3 if outpath is None else outpath

    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    opf_path = self.name_path_map[self.opf_name]
    plumber = Plumber(opf_path, target, self.log)
    plumber.setup_options()
    input_plugin = plugin_for_input_format('azw3')
    output_plugin = plugin_for_output_format('azw3')
    plumber.opts.mobi_passthrough = True

    # NOTE(review): default_log rather than self.log is used from here on
    # -- confirm that this is intended
    book = create_oebbook(default_log, opf_path, plumber.opts)
    set_cover(book)
    output_plugin.convert(book, target, input_plugin, plumber.opts,
            default_log)
|
||||||
|
# }}}
|
||||||
|
|
||||||
def get_container(path, log=None):
    '''
    Create the container object for the book at path.

    Files whose extension is azw3 or mobi (case insensitive) are opened
    with AZW3Container, everything else with EpubContainer.

    :param log: The log to pass to the container. Defaults to default_log.
    '''
    if log is None:
        log = default_log
    extension = path.rpartition('.')[-1].lower()
    if extension in {'azw3', 'mobi'}:
        container_class = AZW3Container
    else:
        container_class = EpubContainer
    return container_class(path, log)
|
||||||
|
|
||||||
def test_roundtrip():
    '''
    Round trip the book named by the last command line argument.

    The book is opened, committed unchanged to a temporary file with the
    same extension, and the result is then compared file by file against a
    fresh copy of the source book. Any differences are printed.
    '''
    source = sys.argv[-1]
    ebook = get_container(source)
    p = PersistentTemporaryFile(suffix='.'+source.rpartition('.')[-1])
    p.close()
    ebook.commit(outpath=p.name)
    # Compare the round tripped output against the source book, not against
    # a second copy of the output
    ebook2 = get_container(source)
    ebook3 = get_container(p.name)
    diff = ebook3.compare_to(ebook2)
    # compare_to() never returns None; an empty result means no differences
    if diff:
        print (diff)


if __name__ == '__main__':
    test_roundtrip()
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user