py3: Port LIT Output

This commit is contained in:
Kovid Goyal 2019-04-11 12:11:52 +05:30
parent 73f58e6868
commit c569f857bb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 78 additions and 80 deletions

View File

@ -1,9 +1,9 @@
from __future__ import absolute_import, division, print_function, unicode_literals
""" """
Modified version of SHA-1 used in Microsoft LIT files. Modified version of SHA-1 used in Microsoft LIT files.
Adapted from the PyPy pure-Python SHA-1 implementation. Adapted from the PyPy pure-Python SHA-1 implementation.
""" """
from __future__ import print_function
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
@ -28,44 +28,36 @@ def _long2bytesBigEndian(n, blocksize=0):
""" """
# After much testing, this algorithm was deemed to be the fastest. # After much testing, this algorithm was deemed to be the fastest.
s = '' s = b''
pack = struct.pack pack = struct.pack
while n > 0: while n > 0:
s = pack('>I', n & 0xffffffff) + s s = pack('>I', n & 0xffffffff) + s
n = n >> 32 n = n >> 32
# Strip off leading zeros. # Strip off leading zeros.
for i in range(len(s)): s = s.lstrip(b'\0')
if s[i] != '\000':
break
else:
# Only happens when n == 0.
s = '\000'
i = 0
s = s[i:]
# Add back some pad bytes. This could be done more efficiently # Add back some pad bytes. This could be done more efficiently
# w.r.t. the de-padding being done above, but sigh... # w.r.t. the de-padding being done above, but sigh...
if blocksize > 0 and len(s) % blocksize: if blocksize > 0 and len(s) % blocksize:
s = (blocksize - len(s) % blocksize) * '\000' + s s = (blocksize - len(s) % blocksize) * b'\000' + s
return s return s
def _bytelist2longBigEndian(list): def _bytelist2longBigEndian(blist):
"Transform a list of characters into a list of longs." "Transform a list of characters into a list of longs."
imax = len(list)/4 imax = len(blist)//4
hl = [0] * imax hl = [0] * imax
j = 0 j = 0
i = 0 i = 0
while i < imax: while i < imax:
b0 = long_type(ord(list[j])) << 24 b0 = long_type(blist[j]) << 24
b1 = long_type(ord(list[j+1])) << 16 b1 = long_type(blist[j+1]) << 16
b2 = long_type(ord(list[j+2])) << 8 b2 = long_type(blist[j+2]) << 8
b3 = long_type(ord(list[j+3])) b3 = long_type(blist[j+3])
hl[i] = b0 | b1 | b2 | b3 hl[i] = b0 | b1 | b2 | b3
i = i+1 i = i+1
j = j+4 j = j+4
@ -140,7 +132,7 @@ class mssha1(object):
self.count = [0, 0] self.count = [0, 0]
# Initial empty message as a sequence of bytes (8 bit characters). # Initial empty message as a sequence of bytes (8 bit characters).
self.input = [] self.input = bytearray()
# Call a separate init function, that can be used repeatedly # Call a separate init function, that can be used repeatedly
# to start from scratch on the same object. # to start from scratch on the same object.
@ -172,7 +164,7 @@ class mssha1(object):
E = self.H4 E = self.H4
for t in range(0, 80): for t in range(0, 80):
TEMP = _rotateLeft(A, 5) + f[t](B, C, D) + E + W[t] + K[t/20] TEMP = _rotateLeft(A, 5) + f[t](B, C, D) + E + W[t] + K[t//20]
E = D E = D
D = C D = C
C = _rotateLeft(B, 30) & 0xffffffff C = _rotateLeft(B, 30) & 0xffffffff
@ -204,6 +196,7 @@ class mssha1(object):
to the hashed string. to the hashed string.
""" """
inBuf = bytearray(inBuf)
leninBuf = long_type(len(inBuf)) leninBuf = long_type(len(inBuf))
# Compute number of bytes mod 64. # Compute number of bytes mod 64.
@ -218,17 +211,17 @@ class mssha1(object):
partLen = 64 - index partLen = 64 - index
if leninBuf >= partLen: if leninBuf >= partLen:
self.input[index:] = list(inBuf[:partLen]) self.input[index:] = inBuf[:partLen]
self._transform(_bytelist2longBigEndian(self.input)) self._transform(_bytelist2longBigEndian(self.input))
i = partLen i = partLen
while i + 63 < leninBuf: while i + 63 < leninBuf:
self._transform(_bytelist2longBigEndian(list(inBuf[i:i+64]))) self._transform(_bytelist2longBigEndian(inBuf[i:i+64]))
i = i + 64 i = i + 64
else: else:
self.input = list(inBuf[i:leninBuf]) self.input = inBuf[i:leninBuf]
else: else:
i = 0 i = 0
self.input = self.input + list(inBuf) self.input = self.input + inBuf
def digest(self): def digest(self):
"""Terminate the message-digest computation and return digest. """Terminate the message-digest computation and return digest.
@ -243,7 +236,7 @@ class mssha1(object):
H2 = self.H2 H2 = self.H2
H3 = self.H3 H3 = self.H3
H4 = self.H4 H4 = self.H4
input = [] + self.input inp = bytearray(self.input)
count = [] + self.count count = [] + self.count
index = (self.count[1] >> 3) & 0x3f index = (self.count[1] >> 3) & 0x3f
@ -253,7 +246,7 @@ class mssha1(object):
else: else:
padLen = 120 - index padLen = 120 - index
padding = ['\200'] + ['\000'] * 63 padding = b'\200' + (b'\000' * 63)
self.update(padding[:padLen]) self.update(padding[:padLen])
# Append length (before padding). # Append length (before padding).
@ -273,7 +266,7 @@ class mssha1(object):
self.H2 = H2 self.H2 = H2
self.H3 = H3 self.H3 = H3
self.H4 = H4 self.H4 = H4
self.input = input self.input = inp
self.count = count self.count = count
return digest return digest
@ -286,7 +279,7 @@ class mssha1(object):
used to exchange the value safely in email or other non- used to exchange the value safely in email or other non-
binary environments. binary environments.
""" """
return ''.join(['%02x' % ord(c) for c in self.digest()]) return ''.join(['%02x' % c for c in bytearray(self.digest())])
def copy(self): def copy(self):
"""Return a clone object. """Return a clone object.

View File

@ -1,14 +1,14 @@
from __future__ import absolute_import, division, print_function, unicode_literals
''' '''
Basic support for writing LIT files. Basic support for writing LIT files.
''' '''
from __future__ import with_statement
from __future__ import print_function
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
from struct import pack from struct import pack
from itertools import count, chain from itertools import count, chain
from operator import attrgetter
import io import io
import time import time
import random import random
@ -30,7 +30,7 @@ import calibre
from calibre import plugins from calibre import plugins
msdes, msdeserror = plugins['msdes'] msdes, msdeserror = plugins['msdes']
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1
from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes, range, zip from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes, range, zip, native_string_type
from polyglot.urllib import urldefrag, unquote from polyglot.urllib import urldefrag, unquote
__all__ = ['LitWriter'] __all__ = ['LitWriter']
@ -62,7 +62,7 @@ def invert_tag_map(tag_map):
OPF_MAP = invert_tag_map(maps.OPF_MAP) OPF_MAP = invert_tag_map(maps.OPF_MAP)
HTML_MAP = invert_tag_map(maps.HTML_MAP) HTML_MAP = invert_tag_map(maps.HTML_MAP)
LIT_MAGIC = 'ITOLITLS' LIT_MAGIC = b'ITOLITLS'
LITFILE_GUID = "{0A9007C1-4076-11D3-8789-0000F8105754}" LITFILE_GUID = "{0A9007C1-4076-11D3-8789-0000F8105754}"
PIECE3_GUID = "{0A9007C3-4076-11D3-8789-0000F8105754}" PIECE3_GUID = "{0A9007C3-4076-11D3-8789-0000F8105754}"
@ -97,24 +97,24 @@ ROOT_OFFSET = 1284508585713721976
ROOT_SIZE = 4165955342166943123 ROOT_SIZE = 4165955342166943123
BLOCK_CAOL = \ BLOCK_CAOL = \
"\x43\x41\x4f\x4c\x02\x00\x00\x00" \ b"\x43\x41\x4f\x4c\x02\x00\x00\x00" \
"\x50\x00\x00\x00\x37\x13\x03\x00" \ b"\x50\x00\x00\x00\x37\x13\x03\x00" \
"\x00\x00\x00\x00\x00\x20\x00\x00" \ b"\x00\x00\x00\x00\x00\x20\x00\x00" \
"\x00\x02\x00\x00\x00\x00\x10\x00" \ b"\x00\x02\x00\x00\x00\x00\x10\x00" \
"\x00\x00\x02\x00\x00\x00\x00\x00" \ b"\x00\x00\x02\x00\x00\x00\x00\x00" \
"\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00"
BLOCK_ITSF = \ BLOCK_ITSF = \
"\x49\x54\x53\x46\x04\x00\x00\x00" \ b"\x49\x54\x53\x46\x04\x00\x00\x00" \
"\x20\x00\x00\x00\x01\x00\x00\x00" b"\x20\x00\x00\x00\x01\x00\x00\x00"
MSDES_CONTROL = \ MSDES_CONTROL = \
"\x03\x00\x00\x00\x29\x17\x00\x00" \ b"\x03\x00\x00\x00\x29\x17\x00\x00" \
"\x01\x00\x00\x00\xa5\xa5\x00\x00" b"\x01\x00\x00\x00\xa5\xa5\x00\x00"
LZXC_CONTROL = \ LZXC_CONTROL = \
"\x07\x00\x00\x00\x4c\x5a\x58\x43" \ b"\x07\x00\x00\x00\x4c\x5a\x58\x43" \
"\x03\x00\x00\x00\x04\x00\x00\x00" \ b"\x03\x00\x00\x00\x04\x00\x00\x00" \
"\x04\x00\x00\x00\x02\x00\x00\x00" \ b"\x04\x00\x00\x00\x02\x00\x00\x00" \
"\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00"
COLLAPSE = re.compile(r'[ \t\r\n\v]+') COLLAPSE = re.compile(r'[ \t\r\n\v]+')
@ -122,16 +122,16 @@ PAGE_BREAKS = {'always', 'left', 'right'}
def decint(value): def decint(value):
bytes = [] ans = bytearray()
while True: while True:
b = value & 0x7f b = value & 0x7f
value >>= 7 value >>= 7
if bytes: if len(ans):
b |= 0x80 b |= 0x80
bytes.append(chr(b)) ans.append(b)
if value == 0: if value == 0:
break break
return ''.join(reversed(bytes)) return bytes(bytearray(reversed(ans)))
def randbytes(n): def randbytes(n):
@ -366,7 +366,7 @@ class LitWriter(object):
self._write(packguid(LITFILE_GUID)) self._write(packguid(LITFILE_GUID))
offset = self._tell() offset = self._tell()
pieces = list(range(offset, offset + (PIECE_SIZE * 5), PIECE_SIZE)) pieces = list(range(offset, offset + (PIECE_SIZE * 5), PIECE_SIZE))
self._write((5 * PIECE_SIZE) * '\0') self._write((5 * PIECE_SIZE) * b'\0')
aoli1 = len(dchunks) if ichunk else ULL_NEG1 aoli1 = len(dchunks) if ichunk else ULL_NEG1
last = len(dchunks) - 1 last = len(dchunks) - 1
ddepth = 2 if ichunk else 1 ddepth = 2 if ichunk else 1
@ -391,7 +391,7 @@ class LitWriter(object):
# Piece #1: Directory chunks # Piece #1: Directory chunks
piece1_offset = self._tell() piece1_offset = self._tell()
number = len(dchunks) + ((ichunk and 1) or 0) number = len(dchunks) + ((ichunk and 1) or 0)
self._write('IFCM', pack('<IIIQQ', self._write(b'IFCM', pack('<IIIQQ',
1, DCHUNK_SIZE, 0x100000, ULL_NEG1, number)) 1, DCHUNK_SIZE, 0x100000, ULL_NEG1, number))
for dchunk in dchunks: for dchunk in dchunks:
self._write(dchunk) self._write(dchunk)
@ -402,7 +402,7 @@ class LitWriter(object):
# Piece #2: Count chunks # Piece #2: Count chunks
piece2_offset = self._tell() piece2_offset = self._tell()
self._write('IFCM', pack('<IIIQQ', self._write(b'IFCM', pack('<IIIQQ',
1, CCHUNK_SIZE, 0x20000, ULL_NEG1, 1)) 1, CCHUNK_SIZE, 0x20000, ULL_NEG1, 1))
cchunk = io.BytesIO() cchunk = io.BytesIO()
last = 0 last = 0
@ -413,9 +413,9 @@ class LitWriter(object):
last = dcount last = dcount
cchunk = cchunk.getvalue() cchunk = cchunk.getvalue()
rem = CCHUNK_SIZE - (len(cchunk) + 50) rem = CCHUNK_SIZE - (len(cchunk) + 50)
self._write('AOLL', pack('<IQQQQQ', self._write(b'AOLL', pack('<IQQQQQ',
rem, 0, ULL_NEG1, ULL_NEG1, 0, 1)) rem, 0, ULL_NEG1, ULL_NEG1, 0, 1))
filler = '\0' * rem filler = b'\0' * rem
self._write(cchunk, filler, pack('<H', len(dcounts))) self._write(cchunk, filler, pack('<H', len(dcounts)))
self._writeat(pieces[2], pack('<QQ', self._writeat(pieces[2], pack('<QQ',
piece2_offset, self._tell() - piece2_offset)) piece2_offset, self._tell() - piece2_offset))
@ -491,7 +491,7 @@ class LitWriter(object):
elif isinstance(data, unicode_type): elif isinstance(data, unicode_type):
data = data.encode('utf-8') data = data.encode('utf-8')
elif hasattr(data, 'cssText'): elif hasattr(data, 'cssText'):
data = str(item) data = item.bytes_representation
self._add_file(name, data, secnum) self._add_file(name, data, secnum)
item.size = len(data) item.size = len(data)
@ -507,10 +507,10 @@ class LitWriter(object):
elif item.media_type in LIT_IMAGES: elif item.media_type in LIT_IMAGES:
manifest['images'].append(item) manifest['images'].append(item)
data = io.BytesIO() data = io.BytesIO()
data.write(pack('<Bc', 1, '\\')) data.write(pack('<Bc', 1, b'\\'))
offset = 0 offset = 0
for state in states: for state in states:
items = sorted(manifest[state]) items = sorted(manifest[state], key=attrgetter('sort_key'))
data.write(pack('<I', len(items))) data.write(pack('<I', len(items)))
for item in items: for item in items:
id, media_type = item.id, item.media_type id, media_type = item.id, item.media_type
@ -528,7 +528,7 @@ class LitWriter(object):
codepoint_to_chr(len(media_type)), unicode_type(media_type)] codepoint_to_chr(len(media_type)), unicode_type(media_type)]
for value in entry: for value in entry:
data.write(value.encode('utf-8')) data.write(value.encode('utf-8'))
data.write('\0') data.write(b'\0')
offset += item.size offset += item.size
self._add_file('/manifest', data.getvalue()) self._add_file('/manifest', data.getvalue())
@ -572,7 +572,7 @@ class LitWriter(object):
_, meta = self._oeb.to_opf1()[OPF_MIME] _, meta = self._oeb.to_opf1()[OPF_MIME]
meta.attrib['ms--minimum_level'] = '0' meta.attrib['ms--minimum_level'] = '0'
meta.attrib['ms--attr5'] = '1' meta.attrib['ms--attr5'] = '1'
meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper() meta.attrib['ms--guid'] = '{%s}' % native_string_type(uuid.uuid4()).upper()
rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP) rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP)
meta = rebin.content meta = rebin.content
self._meta = meta self._meta = meta
@ -583,9 +583,9 @@ class LitWriter(object):
self._add_file('/DRMStorage/DRMSource', drmsource) self._add_file('/DRMStorage/DRMSource', drmsource)
tempkey = self._calculate_deskey([self._meta, drmsource]) tempkey = self._calculate_deskey([self._meta, drmsource])
msdes.deskey(tempkey, msdes.EN0) msdes.deskey(tempkey, msdes.EN0)
self._add_file('/DRMStorage/DRMSealed', msdes.des("\0" * 16)) self._add_file('/DRMStorage/DRMSealed', msdes.des(b"\0" * 16))
self._bookkey = '\0' * 8 self._bookkey = b'\0' * 8
self._add_file('/DRMStorage/ValidationStream', 'MSReader', 3) self._add_file('/DRMStorage/ValidationStream', b'MSReader', 3)
def _build_version(self): def _build_version(self):
self._add_file('/Version', pack('<HH', 8, 1)) self._add_file('/Version', pack('<HH', 8, 1))
@ -598,7 +598,7 @@ class LitWriter(object):
for name in names: for name in names:
data.write(pack('<H', len(name))) data.write(pack('<H', len(name)))
data.write(name.encode('utf-16-le')) data.write(name.encode('utf-16-le'))
data.write('\0\0') data.write(b'\0\0')
self._add_file('::DataSpace/NameList', data.getvalue()) self._add_file('::DataSpace/NameList', data.getvalue())
def _build_storage(self): def _build_storage(self):
@ -608,7 +608,7 @@ class LitWriter(object):
for secnum, name, transforms in mapping: for secnum, name, transforms in mapping:
root = '::DataSpace/Storage/' + name root = '::DataSpace/Storage/' + name
data = self._sections[secnum].getvalue() data = self._sections[secnum].getvalue()
cdata, sdata, tdata, rdata = '', '', '', '' cdata, sdata, tdata, rdata = b'', b'', b'', b''
for guid in transforms: for guid in transforms:
tdata = packguid(guid) + tdata tdata = packguid(guid) + tdata
sdata = sdata + pack('<Q', len(data)) sdata = sdata + pack('<Q', len(data))
@ -619,7 +619,7 @@ class LitWriter(object):
msdes.deskey(self._bookkey, msdes.EN0) msdes.deskey(self._bookkey, msdes.EN0)
pad = 8 - (len(data) & 0x7) pad = 8 - (len(data) & 0x7)
if pad != 8: if pad != 8:
data = data + ('\0' * pad) data = data + (b'\0' * pad)
data = msdes.des(data) data = msdes.des(data)
elif guid == LZXCOMPRESS_GUID: elif guid == LZXCOMPRESS_GUID:
cdata = LZXC_CONTROL + cdata cdata = LZXC_CONTROL + cdata
@ -655,17 +655,20 @@ class LitWriter(object):
hash = mssha1.new() hash = mssha1.new()
for data in hashdata: for data in hashdata:
if prepad > 0: if prepad > 0:
data = ("\000" * prepad) + data data = (b"\000" * prepad) + data
prepad = 0 prepad = 0
postpad = 64 - (len(data) % 64) postpad = 64 - (len(data) % 64)
if postpad < 64: if postpad < 64:
data = data + ("\000" * postpad) data = data + (b"\000" * postpad)
hash.update(data) hash.update(data)
digest = hash.digest() digest = hash.digest()
key = [0] * 8 if not isinstance(digest, bytes):
for i in range(0, len(digest)): digest = digest.encode('ascii')
key[i % 8] ^= ord(digest[i]) digest = bytearray(digest)
return ''.join(chr(x) for x in key) key = bytearray(8)
for i, k in enumerate(digest):
key[i % 8] ^= k
return bytes(key)
def _build_dchunks(self): def _build_dchunks(self):
ddata = [] ddata = []
@ -677,11 +680,13 @@ class LitWriter(object):
quickref = [] quickref = []
name = directory[0].name name = directory[0].name
for entry in directory: for entry in directory:
en = entry.name.encode('utf-8') if entry.name else entry.name en = entry.name
next = ''.join([decint(len(en)), en, if not isinstance(en, bytes):
en = en.encode('utf-8')
nxt = b''.join([decint(len(en)), en,
decint(entry.section), decint(entry.offset), decint(entry.section), decint(entry.offset),
decint(entry.size)]) decint(entry.size)])
usedlen = dchunk.tell() + len(next) + (len(quickref) * 2) + 52 usedlen = dchunk.tell() + len(nxt) + (len(quickref) * 2) + 52
if usedlen >= DCHUNK_SIZE: if usedlen >= DCHUNK_SIZE:
ddata.append((dchunk.getvalue(), quickref, dcount, name)) ddata.append((dchunk.getvalue(), quickref, dcount, name))
dchunk = io.BytesIO() dchunk = io.BytesIO()
@ -690,7 +695,7 @@ class LitWriter(object):
name = en name = en
if (dcount % qrn) == 0: if (dcount % qrn) == 0:
quickref.append(dchunk.tell()) quickref.append(dchunk.tell())
dchunk.write(next) dchunk.write(nxt)
dcount = dcount + 1 dcount = dcount + 1
ddata.append((dchunk.getvalue(), quickref, dcount, name)) ddata.append((dchunk.getvalue(), quickref, dcount, name))
cidmax = len(ddata) - 1 cidmax = len(ddata) - 1
@ -706,10 +711,10 @@ class LitWriter(object):
next = cid + 1 if cid < cidmax else ULL_NEG1 next = cid + 1 if cid < cidmax else ULL_NEG1
rem = DCHUNK_SIZE - (len(content) + 50) rem = DCHUNK_SIZE - (len(content) + 50)
pad = rem - (len(quickref) * 2) pad = rem - (len(quickref) * 2)
dchunk.write('AOLL') dchunk.write(b'AOLL')
dchunk.write(pack('<IQQQQQ', rem, cid, prev, next, rdcount, 1)) dchunk.write(pack('<IQQQQQ', rem, cid, prev, next, rdcount, 1))
dchunk.write(content) dchunk.write(content)
dchunk.write('\0' * pad) dchunk.write(b'\0' * pad)
for ref in reversed(quickref): for ref in reversed(quickref):
dchunk.write(pack('<H', ref)) dchunk.write(pack('<H', ref))
dchunk.write(pack('<H', dcount)) dchunk.write(pack('<H', dcount))
@ -723,6 +728,6 @@ class LitWriter(object):
if ichunk: if ichunk:
rem = DCHUNK_SIZE - (ichunk.tell() + 16) rem = DCHUNK_SIZE - (ichunk.tell() + 16)
pad = rem - 2 pad = rem - 2
ichunk = ''.join(['AOLI', pack('<IQ', rem, len(dchunks)), ichunk = b''.join([b'AOLI', pack('<IQ', rem, len(dchunks)),
ichunk.getvalue(), ('\0' * pad), pack('<H', len(dchunks))]) ichunk.getvalue(), (b'\0' * pad), pack('<H', len(dchunks))])
return dcounts, dchunks, ichunk return dcounts, dchunks, ichunk