From 40ef97e9a3f97f072d98272b0b05df7340dcedf5 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Thu, 10 Jul 2008 23:14:18 -0400 Subject: [PATCH] Lift "trailing entry" handling out of huff/cdic decompression and to general mobipocket book processing --- src/calibre/ebooks/mobi/huffcdic.py | 26 ++---------------------- src/calibre/ebooks/mobi/reader.py | 31 ++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/calibre/ebooks/mobi/huffcdic.py b/src/calibre/ebooks/mobi/huffcdic.py index 40b3356188..aa3de47925 100644 --- a/src/calibre/ebooks/mobi/huffcdic.py +++ b/src/calibre/ebooks/mobi/huffcdic.py @@ -33,7 +33,7 @@ class BitReader(object): class HuffReader(object): def __init__(self, huffs, extra_flags): - self.huffs, self.extra_flags = huffs, extra_flags + self.huffs = huffs if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18': raise MobiError('Invalid HUFF header') @@ -84,32 +84,10 @@ class HuffReader(object): self._unpack(BitReader(data)) return self.r - def sizeof_trailing_entries(self, data): - - def sizeof_trailing_entry(ptr, psize): - bitpos, result = 0, 0 - while True: - v = ord(ptr[psize-1]) - result |= (v & 0x7F) << bitpos - bitpos += 7 - psize -= 1 - if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0): - return result - - num = 0 - size = len(data) - flags = self.extra_flags >> 1 - while flags: - if flags & 1: - num += sizeof_trailing_entry(data, size - num) - flags >>= 1 - return num - def decompress(self, sections): r = '' for data in sections: - trail_size = self.sizeof_trailing_entries(data) - r += self.unpack(data[:len(data)-trail_size]) + r += self.unpack(data) if r.endswith('#'): r = r[:-1] return r diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index dea87dbd8c..31b4510510 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -89,7 +89,7 @@ class BookHeader(object): print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage self.codec = 'cp1252' - if ident == 'TEXTREAD' or self.length != 0xE4: + if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 else: self.extra_flags, = struct.unpack('>L', raw[0xF0:0xF4]) @@ -234,8 +234,33 @@ class MobiReader(object): return opf + def sizeof_trailing_entries(self, data): + def sizeof_trailing_entry(ptr, psize): + bitpos, result = 0, 0 + while True: + v = ord(ptr[psize-1]) + result |= (v & 0x7F) << bitpos + bitpos += 7 + psize -= 1 + if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0): + return result + + num = 0 + size = len(data) + flags = self.book_header.extra_flags >> 1 + while flags: + if flags & 1: + num += sizeof_trailing_entry(data, size - num) + flags >>= 1 + return num + + def text_section(self, index): + data = self.sections[index][0] + trail_size = self.sizeof_trailing_entries(data) + return data[:len(data)-trail_size] + def extract_text(self): - text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)] + text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) self.mobi_html = '' @@ -246,7 +271,7 @@ class MobiReader(object): self.book_header.huff_offset+self.book_header.huff_number)] processed_records += list(range(self.book_header.huff_offset, self.book_header.huff_offset+self.book_header.huff_number)) - huff = HuffReader(huffs, self.book_header.extra_flags) + huff = HuffReader(huffs) self.mobi_html = huff.decompress(text_sections) elif self.book_header.compression_type == '\x00\x02':