Lift "trailing entry" handling out of huff/cdic decompression and into general mobipocket book processing

This commit is contained in:
Marshall T. Vandegrift 2008-07-10 23:14:18 -04:00
parent 40be364965
commit 40ef97e9a3
2 changed files with 30 additions and 27 deletions

View File

@ -33,7 +33,7 @@ class BitReader(object):
class HuffReader(object): class HuffReader(object):
def __init__(self, huffs, extra_flags): def __init__(self, huffs, extra_flags):
self.huffs, self.extra_flags = huffs, extra_flags self.huffs = huffs
if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18': if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18':
raise MobiError('Invalid HUFF header') raise MobiError('Invalid HUFF header')
@ -84,32 +84,10 @@ class HuffReader(object):
self._unpack(BitReader(data)) self._unpack(BitReader(data))
return self.r return self.r
def sizeof_trailing_entries(self, data):
    """Return how many bytes at the end of ``data`` belong to trailing entries.

    Every set bit of ``self.extra_flags`` above bit 0 counts as one trailing
    entry (bit 0 is shifted away and never examined here).  Entries are
    measured from the end of the record backwards: each decoded size is
    added to the running total and the next entry is looked for
    immediately before the bytes already accounted for.

    :param data: One raw record, as a byte string.
    :return: Combined size of all trailing entries, never more than
             ``len(data)``.
    """
    def sizeof_trailing_entry(ptr, psize):
        # Decode one size field that ends at ptr[psize-1].  Bytes are read
        # backwards, 7 payload bits each, the last byte holding the lowest
        # bits.  Decoding stops at a byte with the high bit set, after
        # four bytes (28 bits), or at the start of the buffer.
        bitpos, result = 0, 0
        while True:
            v = ord(ptr[psize-1])
            result |= (v & 0x7F) << bitpos
            bitpos += 7
            psize -= 1
            if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                return result

    num = 0
    size = len(data)
    flags = self.extra_flags >> 1
    while flags:
        if flags & 1:
            if num >= size:
                # Nothing left in front of the entries counted so far.
                # Continuing would call the decoder with psize <= 0, which
                # indexes from the wrong end of the record (or raises
                # IndexError when the record is empty).
                break
            num += sizeof_trailing_entry(data, size - num)
        flags >>= 1
    # A corrupt size field may claim more bytes than the record holds.
    return min(num, size)
def decompress(self, sections): def decompress(self, sections):
r = '' r = ''
for data in sections: for data in sections:
trail_size = self.sizeof_trailing_entries(data) r += self.unpack(data)
r += self.unpack(data[:len(data)-trail_size])
if r.endswith('#'): if r.endswith('#'):
r = r[:-1] r = r[:-1]
return r return r

View File

@ -89,7 +89,7 @@ class BookHeader(object):
print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage
self.codec = 'cp1252' self.codec = 'cp1252'
if ident == 'TEXTREAD' or self.length != 0xE4: if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0 self.extra_flags = 0
else: else:
self.extra_flags, = struct.unpack('>L', raw[0xF0:0xF4]) self.extra_flags, = struct.unpack('>L', raw[0xF0:0xF4])
@ -234,8 +234,33 @@ class MobiReader(object):
return opf return opf
def sizeof_trailing_entries(self, data):
    """Return how many bytes at the end of ``data`` belong to trailing entries.

    Every set bit of ``self.book_header.extra_flags`` above bit 0 counts as
    one trailing entry (bit 0 is shifted away and never examined here).
    Entries are measured from the end of the record backwards: each decoded
    size is added to the running total and the next entry is looked for
    immediately before the bytes already accounted for.

    :param data: One raw record, as a byte string.
    :return: Combined size of all trailing entries, never more than
             ``len(data)``.
    """
    def sizeof_trailing_entry(ptr, psize):
        # Decode one size field that ends at ptr[psize-1].  Bytes are read
        # backwards, 7 payload bits each, the last byte holding the lowest
        # bits.  Decoding stops at a byte with the high bit set, after
        # four bytes (28 bits), or at the start of the buffer.
        bitpos, result = 0, 0
        while True:
            v = ord(ptr[psize-1])
            result |= (v & 0x7F) << bitpos
            bitpos += 7
            psize -= 1
            if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                return result

    num = 0
    size = len(data)
    flags = self.book_header.extra_flags >> 1
    while flags:
        if flags & 1:
            if num >= size:
                # Nothing left in front of the entries counted so far.
                # Continuing would call the decoder with psize <= 0, which
                # indexes from the wrong end of the record (or raises
                # IndexError when the record is empty).
                break
            num += sizeof_trailing_entry(data, size - num)
        flags >>= 1
    # A corrupt size field may claim more bytes than the record holds;
    # callers slice with len(data) - result, so keep that non-negative.
    return min(num, size)
def text_section(self, index):
    """Return the payload of record ``index`` with trailing entries removed.

    :param index: Position of the record in ``self.sections``; the raw
                  bytes are taken from element 0 of that entry.
    :return: The record data minus the number of bytes reported by
             ``self.sizeof_trailing_entries``.
    """
    data = self.sections[index][0]
    trail_size = self.sizeof_trailing_entries(data)
    # Clamp at zero: if the reported size exceeds the record length, a
    # negative slice end would count from the end of the record and return
    # a wrong prefix instead of an empty payload.
    return data[:max(len(data) - trail_size, 0)]
def extract_text(self): def extract_text(self):
text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)] text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1)) processed_records = list(range(0, self.book_header.records+1))
self.mobi_html = '' self.mobi_html = ''
@ -246,7 +271,7 @@ class MobiReader(object):
self.book_header.huff_offset+self.book_header.huff_number)] self.book_header.huff_offset+self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset, processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)) self.book_header.huff_offset+self.book_header.huff_number))
huff = HuffReader(huffs, self.book_header.extra_flags) huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections) self.mobi_html = huff.decompress(text_sections)
elif self.book_header.compression_type == '\x00\x02': elif self.book_header.compression_type == '\x00\x02':