This commit is contained in:
Kovid Goyal 2008-02-26 23:39:28 +00:00
parent e95a7902e0
commit 04dd4e88d8
3 changed files with 10 additions and 11 deletions

View File

@@ -44,8 +44,8 @@ class BitReader(object):
 class HuffReader(object):
-def __init__(self, huffs, extra_flags, codec='cp1252'):
-self.huffs, self.extra_flags, self.codec = huffs, extra_flags, codec
+def __init__(self, huffs, extra_flags):
+self.huffs, self.extra_flags = huffs, extra_flags
 if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18':
 raise MobiError('Invalid HUFF header')
@@ -124,4 +124,4 @@ class HuffReader(object):
 r += self.unpack(data[:len(data)-trail_size])
 if r.endswith('#'):
 r = r[:-1]
-return r.decode(self.codec)
+return r

View File

@@ -18,7 +18,7 @@
 COUNT_BITS = 3
-def decompress_doc(data, codec='cp1252'):
+def decompress_doc(data):
 buffer = [ord(i) for i in data]
 res = []
 i = 0
@@ -42,5 +42,5 @@ def decompress_doc(data, codec='cp1252'):
 for k in range( num ):
 res.append(res[j - di+k])
-return unicode(''.join([chr(i) for i in res]), codec)
+return ''.join([chr(i) for i in res])

View File

@@ -169,6 +169,7 @@ class MobiReader(object):
 processed_records = self.extract_text()
 self.add_anchors()
+self.processed_html = self.processed_html.decode(self.book_header.codec)
 self.extract_images(processed_records, output_dir)
 self.replace_page_breaks()
@@ -202,8 +203,7 @@ class MobiReader(object):
 text_sections = [self.sections[i][0] for i in range(1, self.book_header.records+1)]
 processed_records = list(range(0, self.book_header.records+1))
-self.mobi_html = u''
-codec = self.book_header.codec
+self.mobi_html = ''
 if self.book_header.compression_type == 'DH':
 huffs = [self.sections[i][0] for i in
@@ -211,16 +211,15 @@ class MobiReader(object):
 self.book_header.huff_offset+self.book_header.huff_number)]
 processed_records += list(range(self.book_header.huff_offset,
 self.book_header.huff_offset+self.book_header.huff_number))
-huff = HuffReader(huffs, self.book_header.extra_flags, codec)
+huff = HuffReader(huffs, self.book_header.extra_flags)
 self.mobi_html = huff.decompress(text_sections)
 elif self.book_header.compression_type == '\x00\x02':
 for section in text_sections:
-self.mobi_html += decompress_doc(section, codec)
+self.mobi_html += decompress_doc(section)
 elif self.book_header.compression_type == '\x00\x01':
-t = [i.decode(codec) for i in text_sections]
-self.mobi_html = ''.join(t)
+self.mobi_html = ''.join(text_sections)
 else:
 raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))