mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Various encoding fix-ups. Fix for broken file(s?) from Penguin.
This commit is contained in:
parent
006182e5f4
commit
a349d76379
@ -8,6 +8,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
|
|
||||||
import sys, struct, cStringIO, os
|
import sys, struct, cStringIO, os
|
||||||
import functools
|
import functools
|
||||||
|
import codecs
|
||||||
from itertools import repeat
|
from itertools import repeat
|
||||||
|
|
||||||
from calibre import relpath
|
from calibre import relpath
|
||||||
@ -33,7 +34,6 @@ HTML_DECL = """<?xml version="1.0" encoding="UTF-8" ?>
|
|||||||
DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}"
|
DESENCRYPT_GUID = "{67F6E4A2-60BF-11D3-8540-00C04F58C3CF}"
|
||||||
LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}"
|
LZXCOMPRESS_GUID = "{0A9007C6-4076-11D3-8789-0000F8105754}"
|
||||||
|
|
||||||
LZXC_TAG = 0x43585a4c
|
|
||||||
CONTROL_TAG = 4
|
CONTROL_TAG = 4
|
||||||
CONTROL_WINDOW_SIZE = 12
|
CONTROL_WINDOW_SIZE = 12
|
||||||
RESET_NENTRIES = 4
|
RESET_NENTRIES = 4
|
||||||
@ -41,11 +41,11 @@ RESET_HDRLEN = 12
|
|||||||
RESET_UCLENGTH = 16
|
RESET_UCLENGTH = 16
|
||||||
RESET_INTERVAL = 32
|
RESET_INTERVAL = 32
|
||||||
|
|
||||||
FLAG_OPENING = 1
|
FLAG_OPENING = (1 << 0)
|
||||||
FLAG_CLOSING = 2
|
FLAG_CLOSING = (1 << 1)
|
||||||
FLAG_BLOCK = 4
|
FLAG_BLOCK = (1 << 2)
|
||||||
FLAG_HEAD = 8
|
FLAG_HEAD = (1 << 3)
|
||||||
FLAG_ATOM = 16
|
FLAG_ATOM = (1 << 4)
|
||||||
XML_ENTITIES = ['&amp;', '&apos;', '&lt;', '&gt;', '&quot;']
|
XML_ENTITIES = ['&amp;', '&apos;', '&lt;', '&gt;', '&quot;']
|
||||||
|
|
||||||
def u32(bytes):
|
def u32(bytes):
|
||||||
@ -202,7 +202,7 @@ class UnBinary(object):
|
|||||||
is_goingdown = False
|
is_goingdown = False
|
||||||
if not tag_name:
|
if not tag_name:
|
||||||
raise LitError('Tag ends before it begins.')
|
raise LitError('Tag ends before it begins.')
|
||||||
self.buf.write('</'+tag_name+'>')
|
self.buf.write(u''.join(('</', tag_name, '>')).encode('utf-8'))
|
||||||
dynamic_tag = 0
|
dynamic_tag = 0
|
||||||
tag_name = None
|
tag_name = None
|
||||||
state = 'text'
|
state = 'text'
|
||||||
@ -252,7 +252,7 @@ class UnBinary(object):
|
|||||||
state = 'get attr'
|
state = 'get attr'
|
||||||
elif count > 0:
|
elif count > 0:
|
||||||
if not in_censorship:
|
if not in_censorship:
|
||||||
self.buf.write(c)
|
self.buf.write(unicode(c).encode('utf-8'))
|
||||||
count -= 1
|
count -= 1
|
||||||
if count == 0:
|
if count == 0:
|
||||||
if not in_censorship:
|
if not in_censorship:
|
||||||
@ -272,7 +272,7 @@ class UnBinary(object):
|
|||||||
tag_name += c
|
tag_name += c
|
||||||
count -= 1
|
count -= 1
|
||||||
if count == 0:
|
if count == 0:
|
||||||
self.buf.write(tag_name)
|
self.buf.write(unicode(tag_name).encode('utf-8'))
|
||||||
state = 'get attr'
|
state = 'get attr'
|
||||||
|
|
||||||
elif state == 'get attr length':
|
elif state == 'get attr length':
|
||||||
@ -283,7 +283,7 @@ class UnBinary(object):
|
|||||||
state = 'get custom attr'
|
state = 'get custom attr'
|
||||||
|
|
||||||
elif state == 'get custom attr':
|
elif state == 'get custom attr':
|
||||||
self.buf.write(c)
|
self.buf.write(unicode(c).encode('utf-8'))
|
||||||
count -= 1
|
count -= 1
|
||||||
if count == 0:
|
if count == 0:
|
||||||
self.buf.write('=')
|
self.buf.write('=')
|
||||||
@ -592,6 +592,12 @@ class LitReader(object):
|
|||||||
|
|
||||||
def _read_meta(self):
|
def _read_meta(self):
|
||||||
raw = self.get_file('/meta')
|
raw = self.get_file('/meta')
|
||||||
|
try:
|
||||||
|
xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP))
|
||||||
|
except LitError:
|
||||||
|
if 'PENGUIN group' not in raw: raise
|
||||||
|
print "WARNING: attempting PENGUIN malformed OPF fix"
|
||||||
|
raw = raw.replace('PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1)
|
||||||
xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP))
|
xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP))
|
||||||
self.meta = xml
|
self.meta = xml
|
||||||
|
|
||||||
@ -669,8 +675,8 @@ class LitReader(object):
|
|||||||
control = control[csize:]
|
control = control[csize:]
|
||||||
elif guid == LZXCOMPRESS_GUID:
|
elif guid == LZXCOMPRESS_GUID:
|
||||||
reset_table = self.get_file(
|
reset_table = self.get_file(
|
||||||
'/'.join(['::DataSpace/Storage', name, 'Transform',
|
'/'.join(('::DataSpace/Storage', name, 'Transform',
|
||||||
LZXCOMPRESS_GUID, 'InstanceData/ResetTable']))
|
LZXCOMPRESS_GUID, 'InstanceData/ResetTable')))
|
||||||
content = self._decompress(content, control, reset_table)
|
content = self._decompress(content, control, reset_table)
|
||||||
control = control[csize:]
|
control = control[csize:]
|
||||||
else:
|
else:
|
||||||
@ -684,7 +690,7 @@ class LitReader(object):
|
|||||||
return msdes.new(self.bookkey).decrypt(content)
|
return msdes.new(self.bookkey).decrypt(content)
|
||||||
|
|
||||||
def _decompress(self, content, control, reset_table):
|
def _decompress(self, content, control, reset_table):
|
||||||
if len(control) < 32 or u32(control[CONTROL_TAG:]) != LZXC_TAG:
|
if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC":
|
||||||
raise LitError("Invalid ControlData tag value")
|
raise LitError("Invalid ControlData tag value")
|
||||||
if len(reset_table) < (RESET_INTERVAL + 8):
|
if len(reset_table) < (RESET_INTERVAL + 8):
|
||||||
raise LitError("Reset table is too short")
|
raise LitError("Reset table is too short")
|
||||||
@ -743,16 +749,16 @@ class LitReader(object):
|
|||||||
opf_path = os.path.join(output_dir, opf_path)
|
opf_path = os.path.join(output_dir, opf_path)
|
||||||
self._ensure_dir(opf_path)
|
self._ensure_dir(opf_path)
|
||||||
with open(opf_path, 'w') as f:
|
with open(opf_path, 'w') as f:
|
||||||
f.write(self.get_markup_file('/meta').encode('utf-8'))
|
f.write(self.meta.encode('utf-8'))
|
||||||
for entry in self.manifest.values():
|
for entry in self.manifest.values():
|
||||||
path = os.path.join(output_dir, entry.path)
|
path = os.path.join(output_dir, entry.path)
|
||||||
self._ensure_dir(path)
|
self._ensure_dir(path)
|
||||||
with open(path, 'w') as f:
|
with open(path, 'w') as f:
|
||||||
if 'spine' in entry.state:
|
if 'spine' in entry.state:
|
||||||
name = '/'.join(['/data', entry.internal, 'content'])
|
name = '/'.join(('/data', entry.internal, 'content'))
|
||||||
f.write(self.get_markup_file(name).encode('utf-8'))
|
f.write(self.get_markup_file(name).encode('utf-8'))
|
||||||
else:
|
else:
|
||||||
name = '/'.join(['/data', entry.internal])
|
name = '/'.join(('/data', entry.internal))
|
||||||
f.write(self.get_file(name))
|
f.write(self.get_file(name))
|
||||||
|
|
||||||
def _ensure_dir(self, path):
|
def _ensure_dir(self, path):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user