py3: Port LIT Output

This commit is contained in:
Kovid Goyal 2019-04-11 12:11:52 +05:30
parent 73f58e6868
commit c569f857bb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 78 additions and 80 deletions

View File

@ -1,9 +1,9 @@
from __future__ import absolute_import, division, print_function, unicode_literals
"""
Modified version of SHA-1 used in Microsoft LIT files.
Adapted from the PyPy pure-Python SHA-1 implementation.
"""
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
@ -28,44 +28,36 @@ def _long2bytesBigEndian(n, blocksize=0):
"""
# After much testing, this algorithm was deemed to be the fastest.
s = ''
s = b''
pack = struct.pack
while n > 0:
s = pack('>I', n & 0xffffffff) + s
n = n >> 32
# Strip off leading zeros.
for i in range(len(s)):
if s[i] != '\000':
break
else:
# Only happens when n == 0.
s = '\000'
i = 0
s = s[i:]
s = s.lstrip(b'\0')
# Add back some pad bytes. This could be done more efficiently
# w.r.t. the de-padding being done above, but sigh...
if blocksize > 0 and len(s) % blocksize:
s = (blocksize - len(s) % blocksize) * '\000' + s
s = (blocksize - len(s) % blocksize) * b'\000' + s
return s
def _bytelist2longBigEndian(list):
def _bytelist2longBigEndian(blist):
"Transform a list of characters into a list of longs."
imax = len(list)/4
imax = len(blist)//4
hl = [0] * imax
j = 0
i = 0
while i < imax:
b0 = long_type(ord(list[j])) << 24
b1 = long_type(ord(list[j+1])) << 16
b2 = long_type(ord(list[j+2])) << 8
b3 = long_type(ord(list[j+3]))
b0 = long_type(blist[j]) << 24
b1 = long_type(blist[j+1]) << 16
b2 = long_type(blist[j+2]) << 8
b3 = long_type(blist[j+3])
hl[i] = b0 | b1 | b2 | b3
i = i+1
j = j+4
@ -140,7 +132,7 @@ class mssha1(object):
self.count = [0, 0]
# Initial empty message as a sequence of bytes (8 bit characters).
self.input = []
self.input = bytearray()
# Call a separate init function, that can be used repeatedly
# to start from scratch on the same object.
@ -172,7 +164,7 @@ class mssha1(object):
E = self.H4
for t in range(0, 80):
TEMP = _rotateLeft(A, 5) + f[t](B, C, D) + E + W[t] + K[t/20]
TEMP = _rotateLeft(A, 5) + f[t](B, C, D) + E + W[t] + K[t//20]
E = D
D = C
C = _rotateLeft(B, 30) & 0xffffffff
@ -204,6 +196,7 @@ class mssha1(object):
to the hashed string.
"""
inBuf = bytearray(inBuf)
leninBuf = long_type(len(inBuf))
# Compute number of bytes mod 64.
@ -218,17 +211,17 @@ class mssha1(object):
partLen = 64 - index
if leninBuf >= partLen:
self.input[index:] = list(inBuf[:partLen])
self.input[index:] = inBuf[:partLen]
self._transform(_bytelist2longBigEndian(self.input))
i = partLen
while i + 63 < leninBuf:
self._transform(_bytelist2longBigEndian(list(inBuf[i:i+64])))
self._transform(_bytelist2longBigEndian(inBuf[i:i+64]))
i = i + 64
else:
self.input = list(inBuf[i:leninBuf])
self.input = inBuf[i:leninBuf]
else:
i = 0
self.input = self.input + list(inBuf)
self.input = self.input + inBuf
def digest(self):
"""Terminate the message-digest computation and return digest.
@ -243,7 +236,7 @@ class mssha1(object):
H2 = self.H2
H3 = self.H3
H4 = self.H4
input = [] + self.input
inp = bytearray(self.input)
count = [] + self.count
index = (self.count[1] >> 3) & 0x3f
@ -253,7 +246,7 @@ class mssha1(object):
else:
padLen = 120 - index
padding = ['\200'] + ['\000'] * 63
padding = b'\200' + (b'\000' * 63)
self.update(padding[:padLen])
# Append length (before padding).
@ -273,7 +266,7 @@ class mssha1(object):
self.H2 = H2
self.H3 = H3
self.H4 = H4
self.input = input
self.input = inp
self.count = count
return digest
@ -286,7 +279,7 @@ class mssha1(object):
used to exchange the value safely in email or other non-
binary environments.
"""
return ''.join(['%02x' % ord(c) for c in self.digest()])
return ''.join(['%02x' % c for c in bytearray(self.digest())])
def copy(self):
"""Return a clone object.

View File

@ -1,14 +1,14 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Basic support for writing LIT files.
'''
from __future__ import with_statement
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from struct import pack
from itertools import count, chain
from operator import attrgetter
import io
import time
import random
@ -30,7 +30,7 @@ import calibre
from calibre import plugins
msdes, msdeserror = plugins['msdes']
import calibre.ebooks.lit.mssha1 as mssha1
from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes, range, zip
from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes, range, zip, native_string_type
from polyglot.urllib import urldefrag, unquote
__all__ = ['LitWriter']
@ -62,7 +62,7 @@ def invert_tag_map(tag_map):
OPF_MAP = invert_tag_map(maps.OPF_MAP)
HTML_MAP = invert_tag_map(maps.HTML_MAP)
LIT_MAGIC = 'ITOLITLS'
LIT_MAGIC = b'ITOLITLS'
LITFILE_GUID = "{0A9007C1-4076-11D3-8789-0000F8105754}"
PIECE3_GUID = "{0A9007C3-4076-11D3-8789-0000F8105754}"
@ -97,24 +97,24 @@ ROOT_OFFSET = 1284508585713721976
ROOT_SIZE = 4165955342166943123
# Fixed binary blobs used by the LIT writer.  They must be bytes literals:
# with ``unicode_literals`` in effect a plain "..." literal is text and
# cannot be concatenated with the packed binary data built elsewhere in
# this module (e.g. ``cdata = LZXC_CONTROL + cdata``).

# Starts with the ASCII tag "CAOL".
BLOCK_CAOL = (
    b"\x43\x41\x4f\x4c\x02\x00\x00\x00"
    b"\x50\x00\x00\x00\x37\x13\x03\x00"
    b"\x00\x00\x00\x00\x00\x20\x00\x00"
    b"\x00\x02\x00\x00\x00\x00\x10\x00"
    b"\x00\x00\x02\x00\x00\x00\x00\x00"
    b"\x00\x00\x00\x00\x00\x00\x00\x00"
)

# Starts with the ASCII tag "ITSF".
BLOCK_ITSF = (
    b"\x49\x54\x53\x46\x04\x00\x00\x00"
    b"\x20\x00\x00\x00\x01\x00\x00\x00"
)

# NOTE(review): presumably the control data for the DES transform, going by
# the name; its use is not visible in this part of the file -- confirm.
MSDES_CONTROL = (
    b"\x03\x00\x00\x00\x29\x17\x00\x00"
    b"\x01\x00\x00\x00\xa5\xa5\x00\x00"
)

# Prepended to the control data when a section uses the LZX compression
# transform; bytes 4-8 spell the ASCII tag "LZXC".
LZXC_CONTROL = (
    b"\x07\x00\x00\x00\x4c\x5a\x58\x43"
    b"\x03\x00\x00\x00\x04\x00\x00\x00"
    b"\x04\x00\x00\x00\x02\x00\x00\x00"
    b"\x00\x00\x00\x00\x00\x00\x00\x00"
)
COLLAPSE = re.compile(r'[ \t\r\n\v]+')
@ -122,16 +122,16 @@ PAGE_BREAKS = {'always', 'left', 'right'}
def decint(value):
    '''
    Encode a non-negative integer as a variable-length byte string.

    The value is split into 7-bit groups which are emitted most significant
    group first.  Every byte except the last one has its high bit (0x80)
    set, so the final byte is the only one below 0x80.

    :param value: The integer to encode; must be >= 0.
    :return: The encoded value as ``bytes`` (always at least one byte).
    :raises ValueError: If ``value`` is negative.  A negative number never
        becomes 0 under ``>>=``, so encoding it would loop forever.
    '''
    if value < 0:
        raise ValueError(
            'decint() cannot encode a negative value: %r' % (value,))
    ans = bytearray()
    while True:
        b = value & 0x7f
        value >>= 7
        # Groups are produced least significant first; every group after
        # the first one produced ends up before the final byte once the
        # buffer is reversed, so it carries the continuation bit.
        if ans:
            b |= 0x80
        ans.append(b)
        if value == 0:
            break
    ans.reverse()
    return bytes(ans)
def randbytes(n):
@ -366,7 +366,7 @@ class LitWriter(object):
self._write(packguid(LITFILE_GUID))
offset = self._tell()
pieces = list(range(offset, offset + (PIECE_SIZE * 5), PIECE_SIZE))
self._write((5 * PIECE_SIZE) * '\0')
self._write((5 * PIECE_SIZE) * b'\0')
aoli1 = len(dchunks) if ichunk else ULL_NEG1
last = len(dchunks) - 1
ddepth = 2 if ichunk else 1
@ -391,7 +391,7 @@ class LitWriter(object):
# Piece #1: Directory chunks
piece1_offset = self._tell()
number = len(dchunks) + ((ichunk and 1) or 0)
self._write('IFCM', pack('<IIIQQ',
self._write(b'IFCM', pack('<IIIQQ',
1, DCHUNK_SIZE, 0x100000, ULL_NEG1, number))
for dchunk in dchunks:
self._write(dchunk)
@ -402,7 +402,7 @@ class LitWriter(object):
# Piece #2: Count chunks
piece2_offset = self._tell()
self._write('IFCM', pack('<IIIQQ',
self._write(b'IFCM', pack('<IIIQQ',
1, CCHUNK_SIZE, 0x20000, ULL_NEG1, 1))
cchunk = io.BytesIO()
last = 0
@ -413,9 +413,9 @@ class LitWriter(object):
last = dcount
cchunk = cchunk.getvalue()
rem = CCHUNK_SIZE - (len(cchunk) + 50)
self._write('AOLL', pack('<IQQQQQ',
self._write(b'AOLL', pack('<IQQQQQ',
rem, 0, ULL_NEG1, ULL_NEG1, 0, 1))
filler = '\0' * rem
filler = b'\0' * rem
self._write(cchunk, filler, pack('<H', len(dcounts)))
self._writeat(pieces[2], pack('<QQ',
piece2_offset, self._tell() - piece2_offset))
@ -491,7 +491,7 @@ class LitWriter(object):
elif isinstance(data, unicode_type):
data = data.encode('utf-8')
elif hasattr(data, 'cssText'):
data = str(item)
data = item.bytes_representation
self._add_file(name, data, secnum)
item.size = len(data)
@ -507,10 +507,10 @@ class LitWriter(object):
elif item.media_type in LIT_IMAGES:
manifest['images'].append(item)
data = io.BytesIO()
data.write(pack('<Bc', 1, '\\'))
data.write(pack('<Bc', 1, b'\\'))
offset = 0
for state in states:
items = sorted(manifest[state])
items = sorted(manifest[state], key=attrgetter('sort_key'))
data.write(pack('<I', len(items)))
for item in items:
id, media_type = item.id, item.media_type
@ -528,7 +528,7 @@ class LitWriter(object):
codepoint_to_chr(len(media_type)), unicode_type(media_type)]
for value in entry:
data.write(value.encode('utf-8'))
data.write('\0')
data.write(b'\0')
offset += item.size
self._add_file('/manifest', data.getvalue())
@ -572,7 +572,7 @@ class LitWriter(object):
_, meta = self._oeb.to_opf1()[OPF_MIME]
meta.attrib['ms--minimum_level'] = '0'
meta.attrib['ms--attr5'] = '1'
meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper()
meta.attrib['ms--guid'] = '{%s}' % native_string_type(uuid.uuid4()).upper()
rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP)
meta = rebin.content
self._meta = meta
@ -583,9 +583,9 @@ class LitWriter(object):
self._add_file('/DRMStorage/DRMSource', drmsource)
tempkey = self._calculate_deskey([self._meta, drmsource])
msdes.deskey(tempkey, msdes.EN0)
self._add_file('/DRMStorage/DRMSealed', msdes.des("\0" * 16))
self._bookkey = '\0' * 8
self._add_file('/DRMStorage/ValidationStream', 'MSReader', 3)
self._add_file('/DRMStorage/DRMSealed', msdes.des(b"\0" * 16))
self._bookkey = b'\0' * 8
self._add_file('/DRMStorage/ValidationStream', b'MSReader', 3)
def _build_version(self):
self._add_file('/Version', pack('<HH', 8, 1))
@ -598,7 +598,7 @@ class LitWriter(object):
for name in names:
data.write(pack('<H', len(name)))
data.write(name.encode('utf-16-le'))
data.write('\0\0')
data.write(b'\0\0')
self._add_file('::DataSpace/NameList', data.getvalue())
def _build_storage(self):
@ -608,7 +608,7 @@ class LitWriter(object):
for secnum, name, transforms in mapping:
root = '::DataSpace/Storage/' + name
data = self._sections[secnum].getvalue()
cdata, sdata, tdata, rdata = '', '', '', ''
cdata, sdata, tdata, rdata = b'', b'', b'', b''
for guid in transforms:
tdata = packguid(guid) + tdata
sdata = sdata + pack('<Q', len(data))
@ -619,7 +619,7 @@ class LitWriter(object):
msdes.deskey(self._bookkey, msdes.EN0)
pad = 8 - (len(data) & 0x7)
if pad != 8:
data = data + ('\0' * pad)
data = data + (b'\0' * pad)
data = msdes.des(data)
elif guid == LZXCOMPRESS_GUID:
cdata = LZXC_CONTROL + cdata
@ -655,17 +655,20 @@ class LitWriter(object):
hash = mssha1.new()
for data in hashdata:
if prepad > 0:
data = ("\000" * prepad) + data
data = (b"\000" * prepad) + data
prepad = 0
postpad = 64 - (len(data) % 64)
if postpad < 64:
data = data + ("\000" * postpad)
data = data + (b"\000" * postpad)
hash.update(data)
digest = hash.digest()
key = [0] * 8
for i in range(0, len(digest)):
key[i % 8] ^= ord(digest[i])
return ''.join(chr(x) for x in key)
if not isinstance(digest, bytes):
digest = digest.encode('ascii')
digest = bytearray(digest)
key = bytearray(8)
for i, k in enumerate(digest):
key[i % 8] ^= k
return bytes(key)
def _build_dchunks(self):
ddata = []
@ -677,11 +680,13 @@ class LitWriter(object):
quickref = []
name = directory[0].name
for entry in directory:
en = entry.name.encode('utf-8') if entry.name else entry.name
next = ''.join([decint(len(en)), en,
en = entry.name
if not isinstance(en, bytes):
en = en.encode('utf-8')
nxt = b''.join([decint(len(en)), en,
decint(entry.section), decint(entry.offset),
decint(entry.size)])
usedlen = dchunk.tell() + len(next) + (len(quickref) * 2) + 52
usedlen = dchunk.tell() + len(nxt) + (len(quickref) * 2) + 52
if usedlen >= DCHUNK_SIZE:
ddata.append((dchunk.getvalue(), quickref, dcount, name))
dchunk = io.BytesIO()
@ -690,7 +695,7 @@ class LitWriter(object):
name = en
if (dcount % qrn) == 0:
quickref.append(dchunk.tell())
dchunk.write(next)
dchunk.write(nxt)
dcount = dcount + 1
ddata.append((dchunk.getvalue(), quickref, dcount, name))
cidmax = len(ddata) - 1
@ -706,10 +711,10 @@ class LitWriter(object):
next = cid + 1 if cid < cidmax else ULL_NEG1
rem = DCHUNK_SIZE - (len(content) + 50)
pad = rem - (len(quickref) * 2)
dchunk.write('AOLL')
dchunk.write(b'AOLL')
dchunk.write(pack('<IQQQQQ', rem, cid, prev, next, rdcount, 1))
dchunk.write(content)
dchunk.write('\0' * pad)
dchunk.write(b'\0' * pad)
for ref in reversed(quickref):
dchunk.write(pack('<H', ref))
dchunk.write(pack('<H', dcount))
@ -723,6 +728,6 @@ class LitWriter(object):
if ichunk:
rem = DCHUNK_SIZE - (ichunk.tell() + 16)
pad = rem - 2
ichunk = ''.join(['AOLI', pack('<IQ', rem, len(dchunks)),
ichunk.getvalue(), ('\0' * pad), pack('<H', len(dchunks))])
ichunk = b''.join([b'AOLI', pack('<IQ', rem, len(dchunks)),
ichunk.getvalue(), (b'\0' * pad), pack('<H', len(dchunks))])
return dcounts, dchunks, ichunk