py3: port LIT Input

This commit is contained in:
Kovid Goyal 2019-04-11 14:44:32 +05:30
parent 94d5b27128
commit b972584f4b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 71 additions and 57 deletions

View File

@ -569,7 +569,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
if encoding is None or num > 255: if encoding is None or num > 255:
return check(my_unichr(num)) return check(my_unichr(num))
try: try:
return check(chr(num).decode(encoding)) return check(bytes(bytearray((num,))).decode(encoding))
except UnicodeDecodeError: except UnicodeDecodeError:
return check(my_unichr(num)) return check(my_unichr(num))
from calibre.ebooks.html_entities import html5_entities from calibre.ebooks.html_entities import html5_entities

View File

@ -1,8 +1,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals
''' '''
Support for reading LIT files. Support for reading LIT files.
''' '''
from __future__ import with_statement
from __future__ import print_function
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
@ -18,8 +17,9 @@ import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.oeb.base import urlnormalize, xpath from calibre.ebooks.oeb.base import urlnormalize, xpath
from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks import DRMError from calibre.ebooks import DRMError
from calibre.constants import ispy3
from calibre import plugins from calibre import plugins
from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes, range from polyglot.builtins import codepoint_to_chr, unicode_type, string_or_bytes, range, itervalues
from polyglot.urllib import unquote as urlunquote, urldefrag from polyglot.urllib import unquote as urlunquote, urldefrag
lzx, lxzerror = plugins['lzx'] lzx, lxzerror = plugins['lzx']
@ -69,17 +69,18 @@ def int32(bytes):
return struct.unpack('<l', bytes[:4])[0] return struct.unpack('<l', bytes[:4])[0]
def encint(bytes, remaining): def encint(byts, remaining):
pos, val = 0, 0 pos, val = 0, 0
ba = bytearray(byts)
while remaining > 0: while remaining > 0:
b = ord(bytes[pos]) b = ba[pos]
pos += 1 pos += 1
remaining -= 1 remaining -= 1
val <<= 7 val <<= 7
val |= (b & 0x7f) val |= (b & 0x7f)
if b & 0x80 == 0: if b & 0x80 == 0:
break break
return val, bytes[pos:], remaining return val, byts[pos:], remaining
def msguid(bytes): def msguid(bytes):
@ -88,7 +89,7 @@ def msguid(bytes):
def read_utf8_char(bytes, pos): def read_utf8_char(bytes, pos):
c = ord(bytes[pos]) c = ord(bytes[pos:pos+1])
mask = 0x80 mask = 0x80
if (c & mask): if (c & mask):
elsize = 0 elsize = 0
@ -104,7 +105,7 @@ def read_utf8_char(bytes, pos):
raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos])) raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
c &= (mask - 1) c &= (mask - 1)
for i in range(1, elsize): for i in range(1, elsize):
b = ord(bytes[pos+i]) b = ord(bytes[pos+i:pos+i+1])
if (b & 0xC0) != 0x80: if (b & 0xC0) != 0x80:
raise LitError( raise LitError(
'Invalid UTF8 character: %s' % repr(bytes[pos:pos+i])) 'Invalid UTF8 character: %s' % repr(bytes[pos:pos+i]))
@ -118,7 +119,7 @@ def consume_sized_utf8_string(bytes, zpad=False):
for i in range(ord(slen)): for i in range(ord(slen)):
char, pos = read_utf8_char(bytes, pos) char, pos = read_utf8_char(bytes, pos)
result.append(char) result.append(char)
if zpad and bytes[pos] == '\000': if zpad and bytes[pos:pos+1] == b'\0':
pos += 1 pos += 1
return u''.join(result), bytes[pos:] return u''.join(result), bytes[pos:]
@ -129,10 +130,10 @@ def encode(string):
class UnBinary(object): class UnBinary(object):
AMPERSAND_RE = re.compile( AMPERSAND_RE = re.compile(
r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') br'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)')
OPEN_ANGLE_RE = re.compile(r'<<(?![!]--)') OPEN_ANGLE_RE = re.compile(br'<<(?![!]--)')
CLOSE_ANGLE_RE = re.compile(r'(?<!--)>>(?=>>|[^>])') CLOSE_ANGLE_RE = re.compile(br'(?<!--)>>(?=>>|[^>])')
DOUBLE_ANGLE_RE = re.compile(r'([<>])\1') DOUBLE_ANGLE_RE = re.compile(br'([<>])\1')
EMPTY_ATOMS = ({},{}) EMPTY_ATOMS = ({},{})
def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS): def __init__(self, bin, path, manifest={}, map=HTML_MAP, atoms=EMPTY_ATOMS):
@ -149,10 +150,10 @@ class UnBinary(object):
def escape_reserved(self): def escape_reserved(self):
raw = self.raw raw = self.raw
raw = self.AMPERSAND_RE.sub(r'&amp;', raw) raw = self.AMPERSAND_RE.sub(br'&amp;', raw)
raw = self.OPEN_ANGLE_RE.sub(r'&lt;', raw) raw = self.OPEN_ANGLE_RE.sub(br'&lt;', raw)
raw = self.CLOSE_ANGLE_RE.sub(r'&gt;', raw) raw = self.CLOSE_ANGLE_RE.sub(br'&gt;', raw)
raw = self.DOUBLE_ANGLE_RE.sub(r'\1', raw) raw = self.DOUBLE_ANGLE_RE.sub(br'\1', raw)
self.raw = raw self.raw = raw
def item_path(self, internal_id): def item_path(self, internal_id):
@ -172,11 +173,19 @@ class UnBinary(object):
relpath = (['..'] * (len(base) - index)) + target[index:] relpath = (['..'] * (len(base) - index)) + target[index:]
return '/'.join(relpath) return '/'.join(relpath)
def __unicode__(self): @property
def binary_representation(self):
return self.raw
@property
def unicode_representation(self):
return self.raw.decode('utf-8') return self.raw.decode('utf-8')
def __unicode__(self):
return self.unicode_representation
def __str__(self): def __str__(self):
return self.raw return self.unicode_representation if ispy3 else self.binary_representation
def binary_to_text(self, bin, buf): def binary_to_text(self, bin, buf):
stack = [(0, None, None, 0, 0, False, False, 'text', 0)] stack = [(0, None, None, 0, 0, False, False, 'text', 0)]
@ -320,7 +329,9 @@ class UnBinary(object):
c = '&quot;' c = '&quot;'
elif c == '<': elif c == '<':
c = '&lt;' c = '&lt;'
buf.write(c.encode('ascii', 'xmlcharrefreplace')) if isinstance(c, unicode_type):
c = c.encode('ascii', 'xmlcharrefreplace')
buf.write(c)
count -= 1 count -= 1
if count == 0: if count == 0:
if not in_censorship: if not in_censorship:
@ -449,7 +460,7 @@ class LitFile(object):
os.path.basename(self.stream.name))[0] + '.opf' os.path.basename(self.stream.name))[0] + '.opf'
except AttributeError: except AttributeError:
self.opf_path = 'content.opf' self.opf_path = 'content.opf'
if self.magic != 'ITOLITLS': if self.magic != b'ITOLITLS':
raise LitError('Not a valid LIT file') raise LitError('Not a valid LIT file')
if self.version != 1: if self.version != 1:
raise LitError('Unknown LIT version %d' % (self.version,)) raise LitError('Unknown LIT version %d' % (self.version,))
@ -535,30 +546,30 @@ class LitFile(object):
def read_secondary_header(self): def read_secondary_header(self):
offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
bytes = self.read_raw(offset, self.sec_hdr_len) byts = self.read_raw(offset, self.sec_hdr_len)
offset = int32(bytes[4:]) offset = int32(byts[4:])
while offset < len(bytes): while offset < len(byts):
blocktype = bytes[offset:offset+4] blocktype = byts[offset:offset+4]
blockver = u32(bytes[offset+4:]) blockver = u32(byts[offset+4:])
if blocktype == 'CAOL': if blocktype == b'CAOL':
if blockver != 2: if blockver != 2:
raise LitError( raise LitError(
'Unknown CAOL block format %d' % blockver) 'Unknown CAOL block format %d' % blockver)
self.creator_id = u32(bytes[offset+12:]) self.creator_id = u32(byts[offset+12:])
self.entry_chunklen = u32(bytes[offset+20:]) self.entry_chunklen = u32(byts[offset+20:])
self.count_chunklen = u32(bytes[offset+24:]) self.count_chunklen = u32(byts[offset+24:])
self.entry_unknown = u32(bytes[offset+28:]) self.entry_unknown = u32(byts[offset+28:])
self.count_unknown = u32(bytes[offset+32:]) self.count_unknown = u32(byts[offset+32:])
offset += 48 offset += 48
elif blocktype == 'ITSF': elif blocktype == b'ITSF':
if blockver != 4: if blockver != 4:
raise LitError( raise LitError(
'Unknown ITSF block format %d' % blockver) 'Unknown ITSF block format %d' % blockver)
if u32(bytes[offset+4+16:]): if u32(byts[offset+4+16:]):
raise LitError('This file has a 64bit content offset') raise LitError('This file has a 64bit content offset')
self.content_offset = u32(bytes[offset+16:]) self.content_offset = u32(byts[offset+16:])
self.timestamp = u32(bytes[offset+24:]) self.timestamp = u32(byts[offset+24:])
self.language_id = u32(bytes[offset+28:]) self.language_id = u32(byts[offset+28:])
offset += 48 offset += 48
if not hasattr(self, 'content_offset'): if not hasattr(self, 'content_offset'):
raise LitError('Could not figure out the content offset') raise LitError('Could not figure out the content offset')
@ -589,7 +600,7 @@ class LitFile(object):
self.piece4_guid = piece self.piece4_guid = piece
def read_directory(self, piece): def read_directory(self, piece):
if not piece.startswith('IFCM'): if not piece.startswith(b'IFCM'):
raise LitError('Header piece #1 is not main directory.') raise LitError('Header piece #1 is not main directory.')
chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
if (32 + (num_chunks * chunk_size)) != len(piece): if (32 + (num_chunks * chunk_size)) != len(piece):
@ -599,7 +610,7 @@ class LitFile(object):
offset = 32 + (i * chunk_size) offset = 32 + (i * chunk_size)
chunk = piece[offset:offset + chunk_size] chunk = piece[offset:offset + chunk_size]
tag, chunk = chunk[:4], chunk[4:] tag, chunk = chunk[:4], chunk[4:]
if tag != 'AOLL': if tag != b'AOLL':
continue continue
remaining, chunk = int32(chunk[:4]), chunk[4:] remaining, chunk = int32(chunk[:4]), chunk[4:]
if remaining >= chunk_size: if remaining >= chunk_size:
@ -647,7 +658,7 @@ class LitFile(object):
if pos + size > len(raw): if pos + size > len(raw):
raise LitError('Invalid Namelist section') raise LitError('Invalid Namelist section')
self.section_names[section] = \ self.section_names[section] = \
raw[pos:pos+size].decode('utf-16-le').rstrip('\000') raw[pos:pos+size].decode('utf-16-le').rstrip('\0')
pos += size pos += size
def read_manifest(self): def read_manifest(self):
@ -657,7 +668,7 @@ class LitFile(object):
self.manifest = {} self.manifest = {}
self.paths = {self.opf_path: None} self.paths = {self.opf_path: None}
while raw: while raw:
slen, raw = ord(raw[0]), raw[1:] slen, raw = ord(raw[0:1]), raw[1:]
if slen == 0: if slen == 0:
break break
root, raw = raw[:slen].decode('utf8'), raw[slen:] root, raw = raw[:slen].decode('utf8'), raw[slen:]
@ -679,7 +690,7 @@ class LitFile(object):
mime_type, raw = consume_sized_utf8_string(raw, zpad=True) mime_type, raw = consume_sized_utf8_string(raw, zpad=True)
self.manifest[internal] = ManifestItem( self.manifest[internal] = ManifestItem(
original, internal, mime_type, offset, root, state) original, internal, mime_type, offset, root, state)
mlist = self.manifest.values() mlist = list(itervalues(self.manifest))
# Remove any common path elements # Remove any common path elements
if len(mlist) > 1: if len(mlist) > 1:
shared = mlist[0].path shared = mlist[0].path
@ -715,7 +726,7 @@ class LitFile(object):
if self.drmlevel < 5: if self.drmlevel < 5:
msdes.deskey(self.calculate_deskey(), msdes.DE1) msdes.deskey(self.calculate_deskey(), msdes.DE1)
bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed')) bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
if bookkey[0] != '\000': if bookkey[0:1] != b'\0':
raise LitError('Unable to decrypt title key!') raise LitError('Unable to decrypt title key!')
self.bookkey = bookkey[1:9] self.bookkey = bookkey[1:9]
else: else:
@ -730,17 +741,20 @@ class LitFile(object):
for name in hashfiles: for name in hashfiles:
data = self.get_file(name) data = self.get_file(name)
if prepad > 0: if prepad > 0:
data = ("\000" * prepad) + data data = (b"\000" * prepad) + data
prepad = 0 prepad = 0
postpad = 64 - (len(data) % 64) postpad = 64 - (len(data) % 64)
if postpad < 64: if postpad < 64:
data = data + ("\000" * postpad) data = data + (b"\000" * postpad)
hash.update(data) hash.update(data)
digest = hash.digest() digest = hash.digest()
key = [0] * 8 if not isinstance(digest, bytes):
for i in range(0, len(digest)): digest = digest.encode('ascii')
key[i % 8] ^= ord(digest[i]) digest = bytearray(digest)
return ''.join(chr(x) for x in key) key = bytearray(8)
for i, d in enumerate(digest):
key[i % 8] ^= d
return bytes(key)
def get_file(self, name): def get_file(self, name):
entry = self.entries[name] entry = self.entries[name]
@ -786,12 +800,12 @@ class LitFile(object):
extra = length & 0x7 extra = length & 0x7
if extra > 0: if extra > 0:
self.warn("content length not a multiple of block size") self.warn("content length not a multiple of block size")
content += "\0" * (8 - extra) content += b"\0" * (8 - extra)
msdes.deskey(self.bookkey, msdes.DE1) msdes.deskey(self.bookkey, msdes.DE1)
return msdes.des(content) return msdes.des(content)
def decompress(self, content, control, reset_table): def decompress(self, content, control, reset_table):
if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != b"LZXC":
raise LitError("Invalid ControlData tag value") raise LitError("Invalid ControlData tag value")
if len(reset_table) < (RESET_INTERVAL + 8): if len(reset_table) < (RESET_INTERVAL + 8):
raise LitError("Reset table is too short") raise LitError("Reset table is too short")
@ -845,7 +859,7 @@ class LitFile(object):
bytes_remaining = 0 bytes_remaining = 0
if bytes_remaining > 0: if bytes_remaining > 0:
raise LitError("Failed to completely decompress section") raise LitError("Failed to completely decompress section")
return ''.join(result) return b''.join(result)
def get_atoms(self, entry): def get_atoms(self, entry):
name = '/'.join(('/data', entry.internal, 'atom')) name = '/'.join(('/data', entry.internal, 'atom'))
@ -902,7 +916,7 @@ class LitContainer(object):
manifest = self._litfile.manifest manifest = self._litfile.manifest
atoms = self._litfile.get_atoms(entry) atoms = self._litfile.get_atoms(entry)
unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms) unbin = UnBinary(raw, name, manifest, HTML_MAP, atoms)
content = HTML_DECL + str(unbin) content = HTML_DECL + unbin.unicode_representation
tags = ('personname', 'place', 'city', 'country-region') tags = ('personname', 'place', 'city', 'country-region')
pat = r'(?i)</{0,1}st1:(%s)>'%('|'.join(tags)) pat = r'(?i)</{0,1}st1:(%s)>'%('|'.join(tags))
content = re.sub(pat, '', content) content = re.sub(pat, '', content)
@ -918,13 +932,13 @@ class LitContainer(object):
try: try:
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
except LitError: except LitError:
if 'PENGUIN group' not in raw: if b'PENGUIN group' not in raw:
raise raise
print("WARNING: attempting PENGUIN malformed OPF fix") print("WARNING: attempting PENGUIN malformed OPF fix")
raw = raw.replace( raw = raw.replace(
'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) b'PENGUIN group', b'\x00\x01\x18\x00PENGUIN group', 1)
unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP)
return str(unbin) return unbin.unicode_representation
def get_metadata(self): def get_metadata(self):
return self._read_meta() return self._read_meta()