From c87ad6d69f324b32ba0ac375b09955b7b84617f8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 17 Mar 2012 15:31:05 +0530 Subject: [PATCH] Refactor inspect MOBI to use the INDX reading code from mobi.reader --- src/calibre/ebooks/mobi/debug.py | 284 +++++------------------- src/calibre/ebooks/mobi/reader/index.py | 163 ++++++++------ src/calibre/ebooks/mobi/reader/mobi8.py | 1 + src/calibre/ebooks/mobi/reader/ncx.py | 59 +++-- src/calibre/ebooks/mobi/utils.py | 23 +- 5 files changed, 193 insertions(+), 337 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index b12c9d2121..35484d0b39 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -15,6 +15,8 @@ from lxml import html from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from calibre.ebooks.mobi.reader.index import (parse_index_record, + parse_tagx_section) from calibre.ebooks.mobi.utils import (decode_hex_number, decint, get_trailing_data, decode_tbs, read_font_record) from calibre.utils.magick.draw import identify_data @@ -405,14 +407,10 @@ class MOBIHeader(object): # {{{ class TagX(object): # {{{ - def __init__(self, raw): - self.tag = ord(raw[0]) - self.num_values = ord(raw[1]) - self.bitmask = ord(raw[2]) - # End of file = 1 iff last entry - # When it is 1 all others are 0 - self.eof = ord(raw[3]) - + def __init__(self, tag, num_values, bitmask, eof): + self.tag, self.num_values, self.bitmask, self.eof = (tag, num_values, + bitmask, eof) + self.num_of_values = num_values self.is_eof = (self.eof == 1 and self.tag == 0 and self.num_values == 0 and self.bitmask == 0) @@ -459,13 +457,7 @@ class SecondaryIndexHeader(object): # {{{ raise ValueError('Invalid TAGX section') self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) - tag_table = tagx[12:self.tagx_header_length] - if len(tag_table) % 4 != 0: - raise ValueError('Invalid Tag table') - num_tagx_entries = len(tag_table) // 4 - self.tagx_entries = [] - for i in range(num_tagx_entries): - self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4])) + self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]] if self.tagx_entries and not self.tagx_entries[-1].is_eof: raise ValueError('TAGX last entry is not EOF') @@ -533,7 +525,8 @@ class IndexHeader(object): # {{{ raise ValueError('Invalid Primary Index Record') self.header_length, = struct.unpack('>I', raw[4:8]) - self.unknown1 = raw[8:16] + self.unknown1 = raw[8:12] + self.header_type, = struct.unpack('>I', raw[12:16]) self.index_type, = struct.unpack('>I', raw[16:20]) self.index_type_desc = {0: 'normal', 2: 'inflection', 6: 'calibre'}.get(self.index_type, 'unknown') @@ -562,13 +555,7 @@ class IndexHeader(object): # {{{ raise ValueError('Invalid TAGX section') self.tagx_header_length, = struct.unpack('>I', tagx[4:8]) self.tagx_control_byte_count, = struct.unpack('>I', tagx[8:12]) - tag_table = tagx[12:self.tagx_header_length] - if len(tag_table) % 4 != 0: - raise ValueError('Invalid Tag table') - num_tagx_entries = len(tag_table) // 4 - self.tagx_entries = [] - for i in range(num_tagx_entries): - self.tagx_entries.append(TagX(tag_table[i*4:(i+1)*4])) + self.tagx_entries = [TagX(*x) for x in parse_tagx_section(tagx)[1]] if self.tagx_entries and not self.tagx_entries[-1].is_eof: raise ValueError('TAGX last entry is not EOF') @@ -602,6 +589,7 @@ class IndexHeader(object): 
# {{{ a('Header length: %d'%self.header_length) u(self.unknown1) + a('Header type: %d'%self.header_type) a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type)) a('Offset to IDXT start: %d'%self.idxt_start) a('Number of index records: %d'%self.index_count) @@ -661,19 +649,15 @@ class Tag(object): # {{{ } - def __init__(self, tagx, vals, entry_type, cncx): + def __init__(self, tag_type, vals, cncx): self.value = vals if len(vals) > 1 else vals[0] if vals else None - self.entry_type = entry_type - tag_type = tagx.tag self.cncx_value = None if tag_type in self.TAG_MAP: self.attr, self.desc = self.TAG_MAP[tag_type] else: - print ('Unknown tag value: %d in entry type: %s'%(tag_type, - entry_type)) - self.desc = '??Unknown (tag value: %d type: %s)'%( - tag_type, entry_type) + print ('Unknown tag value: %%s'%tag_type) + self.desc = '??Unknown (tag value: %d)'%tag_type self.attr = 'unknown' if '_offset' in self.attr: @@ -695,50 +679,13 @@ class IndexEntry(object): # {{{ used in the navigation UI. ''' - def __init__(self, ident, entry_type, raw, cncx, tagx_entries, - control_byte_count): - self.index = ident - self.raw = raw - self.tags = [] - self.entry_type = entry_type - self.byte_size = len(raw) - - orig_raw = raw - - if control_byte_count not in (1, 2): - raise ValueError('Unknown control byte count: %d'% - control_byte_count) - - self.flags = 0 - - if control_byte_count == 2: - self.flags = ord(raw[0]) - raw = raw[1:] - - expected_tags = [tag for tag in tagx_entries if tag.bitmask & - entry_type] - - flags = self.flags - for tag in expected_tags: - vals = [] - - if tag.tag > 0b1000000: # 0b1000000 = 64 - has_tag = flags & 0b1 - flags = flags >> 1 - if not has_tag: continue - for i in range(tag.num_values): - if not raw: - raise ValueError('Index entry does not match TAGX header') - val, consumed = decint(raw) - raw = raw[consumed:] - vals.append(val) - self.tags.append(Tag(tag, vals, self.entry_type, cncx)) - - self.consumed = len(orig_raw) - len(raw) - self.trailing_bytes = raw - if self.trailing_bytes.replace(b'\0', b''): - raise ValueError('%s has leftover bytes: %s'%(self, format_bytes( - self.trailing_bytes))) + def __init__(self, ident, entry, cncx): + try: + self.index = int(ident, 16) + except ValueError: + self.index = ident + self.tags = [Tag(tag_type, vals, cncx) for tag_type, vals in + entry.iteritems()] @property def label(self): @@ -797,102 +744,14 @@ class IndexEntry(object): # {{{ return [0, 0] def __str__(self): - ans = ['Index Entry(index=%s, entry_type=%s, flags=%s, ' - 'length=%d, byte_size=%d)'%( - self.index, bin(self.entry_type), bin(self.flags)[2:], - len(self.tags), self.byte_size)] + ans = ['Index Entry(index=%s, length=%d)'%( + self.index, len(self.tags))] for tag in self.tags: if tag.value is not None: ans.append('\t'+str(tag)) if self.first_child_index != -1: ans.append('\tNumber of children: %d'%(self.last_child_index - self.first_child_index + 1)) - if self.trailing_bytes: - ans.append('\tTrailing bytes: %r'%self.trailing_bytes) - return '\n'.join(ans) - -# }}} - -class SecondaryIndexRecord(object): # {{{ - - def __init__(self, record, index_header, cncx): - self.record = record - raw = self.record.raw - - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') - - u = struct.unpack - - self.header_length, = u('>I', raw[4:8]) - self.unknown1 = raw[8:12] - self.header_type, = u('>I', raw[12:16]) - self.unknown2 = raw[16:20] - self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) - if self.idxt_offset < 192: - raise 
ValueError('Unknown Index record structure') - self.unknown3 = raw[28:36] - self.unknown4 = raw[36:192] # Should be 156 bytes - - self.index_offsets = [] - indices = raw[self.idxt_offset:] - if indices[:4] != b'IDXT': - raise ValueError("Invalid IDXT index table") - indices = indices[4:] - for i in range(self.idxt_count): - off, = u(b'>H', indices[i*2:(i+1)*2]) - self.index_offsets.append(off-192) - rest = indices[(i+1)*2:] - if rest.replace(b'\0', ''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - indxt = raw[192:self.idxt_offset] - self.size_of_indxt_block = len(indxt) - - self.indices = [] - for i, off in enumerate(self.index_offsets): - try: - next_off = self.index_offsets[i+1] - except: - next_off = len(indxt) - num = ord(indxt[off]) - index = indxt[off+1:off+1+num] - consumed = 1 + num - entry_type = ord(indxt[off+consumed]) - pos = off+consumed+1 - idxe = IndexEntry(index, entry_type, - indxt[pos:next_off], cncx, - index_header.tagx_entries, - index_header.tagx_control_byte_count) - self.indices.append(idxe) - - rest = indxt[pos+self.indices[-1].consumed:] - if rest.replace(b'\0', b''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - - def __str__(self): - ans = ['*'*20 + ' Secondary Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] - a = ans.append - def u(w): - a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, - len(w), not bool(w.replace(b'\0', b'')) )) - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Unknown (header type? index record number? always 1?): %d'%self.header_type) - u(self.unknown2) - a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, - self.idxt_offset)) - a('IDXT Count: %d'%self.idxt_count) - u(self.unknown3) - u(self.unknown4) - a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries (%d entries):'%len(self.indices)) - for entry in self.indices: - a(str(entry)) - a('') - - return '\n'.join(ans) # }}} @@ -904,58 +763,25 @@ class IndexRecord(object): # {{{ in the trailing data of the text records. 
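# A minimal usage sketch (not part of the patch): after the refactor,
# parse_index_record() fills a table that maps each entry's ident string to a
# tag map of {tag_number: [values]}, and the rewritten IndexEntry/Tag classes
# above simply wrap one such pair.  The ident, tag numbers and values below
# are hypothetical; the imports assume a calibre checkout at this commit.
from calibre.ebooks.mobi.reader.index import CNCX
from calibre.ebooks.mobi.debug import IndexEntry

cncx = CNCX([], 'utf-8')                    # an empty CNCX is enough for this sketch
tag_map = {1: [1024], 2: [200], 4: [1]}     # offset, size, depth (hypothetical values)
entry = IndexEntry('004F', tag_map, cncx)
# entry.index == 0x4F, and entry.tags holds one Tag object per tag_map item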
''' - def __init__(self, record, index_header, cncx): - self.record = record + def __init__(self, records, index_header, cncx): self.alltext = None - raw = self.record.raw + table = OrderedDict() + tags = [TagX(x.tag, x.num_values, x.bitmask, x.eof) for x in + index_header.tagx_entries] + for record in records: + raw = record.raw - if raw[:4] != b'INDX': - raise ValueError('Invalid Primary Index Record') + if raw[:4] != b'INDX': + raise ValueError('Invalid Primary Index Record') - u = struct.unpack + parse_index_record(table, record.raw, + index_header.tagx_control_byte_count, tags, + index_header.index_encoding, strict=True) - self.header_length, = u('>I', raw[4:8]) - self.unknown1 = raw[8:12] - self.header_type, = u('>I', raw[12:16]) - self.unknown2 = raw[16:20] - self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28]) - if self.idxt_offset < 192: - raise ValueError('Unknown Index record structure') - self.unknown3 = raw[28:36] - self.unknown4 = raw[36:192] # Should be 156 bytes - - self.index_offsets = [] - indices = raw[self.idxt_offset:] - if indices[:4] != b'IDXT': - raise ValueError("Invalid IDXT index table") - indices = indices[4:] - for i in range(self.idxt_count): - off, = u(b'>H', indices[i*2:(i+1)*2]) - self.index_offsets.append(off-192) - rest = indices[(i+1)*2:] - if rest.replace(b'\0', ''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) - - indxt = raw[192:self.idxt_offset] - self.size_of_indxt_block = len(indxt) self.indices = [] - for i, off in enumerate(self.index_offsets): - try: - next_off = self.index_offsets[i+1] - except: - next_off = len(indxt) - index, consumed = decode_hex_number(indxt[off:]) - entry_type = ord(indxt[off+consumed]) - pos = off+consumed+1 - idxe = IndexEntry(index, entry_type, - indxt[pos:next_off], cncx, - index_header.tagx_entries, - index_header.tagx_control_byte_count) - self.indices.append(idxe) - rest = indxt[pos+self.indices[-1].consumed:] - if rest.replace(b'\0', b''): # There can be padding null bytes - raise ValueError('Extra bytes after IDXT table: %r'%rest) + for ident, entry in table.iteritems(): + self.indices.append(IndexEntry(ident, entry, cncx)) def get_parent(self, index): if index.depth < 1: @@ -965,24 +791,12 @@ class IndexRecord(object): # {{{ if p.depth != parent_depth: continue - def __str__(self): - ans = ['*'*20 + ' Index Record (%d bytes) '%len(self.record.raw)+ '*'*20] + ans = ['*'*20 + ' Index Entries (%d entries) '%len(self.indices)+ '*'*20] a = ans.append def u(w): a('Unknown: %r (%d bytes) (All zeros: %r)'%(w, len(w), not bool(w.replace(b'\0', b'')) )) - a('Header length: %d'%self.header_length) - u(self.unknown1) - a('Unknown (header type? index record number? 
always 1?): %d'%self.header_type) - u(self.unknown2) - a('IDXT Offset (%d block size): %d'%(self.size_of_indxt_block, - self.idxt_offset)) - a('IDXT Count: %d'%self.idxt_count) - u(self.unknown3) - u(self.unknown4) - a('Index offsets: %r'%self.index_offsets) - a('\nIndex Entries (%d entries):'%len(self.indices)) for entry in self.indices: offset = entry.offset a(str(entry)) @@ -1157,7 +971,7 @@ class TBSIndexing(object): # {{{ def get_index(self, idx): for i in self.indices: - if i.index == idx: return i + if i.index in {idx, unicode(idx)}: return i raise IndexError('Index %d not found'%idx) def __str__(self): @@ -1190,7 +1004,7 @@ class TBSIndexing(object): # {{{ if entries: ans.append('\t%s:'%typ) for x in entries: - ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' + ans.append(('\t\tIndex Entry: %s (Parent index: %s, ' 'Depth: %d, Offset: %d, Size: %d) [%s]')%( x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) def bin4(num): @@ -1287,18 +1101,18 @@ class TBSIndexing(object): # {{{ ' when reading starting section'%extra) si = self.get_index(si) ans.append('The section at the start of this record is:' - ' %d'%si.index) + ' %s'%si.index) if 0b0100 in extra: num = extra[0b0100] ans.append('The number of articles from the section %d' - ' in this record: %d'%(si.index, num)) + ' in this record: %s'%(si.index, num)) elif 0b0001 in extra: eof = extra[0b0001] if eof != 0: raise ValueError('Unknown eof value %s when reading' ' starting section. All bytes: %r'%(eof, orig)) ans.append('??This record has more than one article from ' - ' the section: %d'%si.index) + ' the section: %s'%si.index) return si, byts # }}} @@ -1362,21 +1176,23 @@ class MOBIFile(object): # {{{ pir = self.mobi_header.primary_index_record if pir != NULL_INDEX: self.index_header = IndexHeader(self.records[pir]) + numi = self.index_header.index_count self.cncx = CNCX(self.records[ - pir+2:pir+2+self.index_header.num_of_cncx_blocks], + pir+1+numi:pir+1+numi+self.index_header.num_of_cncx_blocks], self.index_header.index_encoding) - self.index_record = IndexRecord(self.records[pir+1], + self.index_record = IndexRecord(self.records[pir+1:pir+1+numi], self.index_header, self.cncx) self.indexing_record_nums = set(xrange(pir, - pir+2+self.index_header.num_of_cncx_blocks)) + pir+1+numi+self.index_header.num_of_cncx_blocks)) self.secondary_index_record = self.secondary_index_header = None sir = self.mobi_header.secondary_index_record if sir != NULL_INDEX: self.secondary_index_header = SecondaryIndexHeader(self.records[sir]) + numi = self.secondary_index_header.index_count self.indexing_record_nums.add(sir) - self.secondary_index_record = SecondaryIndexRecord( - self.records[sir+1], self.secondary_index_header, self.cncx) - self.indexing_record_nums.add(sir+1) + self.secondary_index_record = IndexRecord( + self.records[sir+1:sir+1+numi], self.secondary_index_header, self.cncx) + self.indexing_record_nums |= set(xrange(sir+1, sir+1+numi)) ntr = self.mobi_header.number_of_text_records diff --git a/src/calibre/ebooks/mobi/reader/index.py b/src/calibre/ebooks/mobi/reader/index.py index b292d55c13..dd85b5a5cb 100644 --- a/src/calibre/ebooks/mobi/reader/index.py +++ b/src/calibre/ebooks/mobi/reader/index.py @@ -8,9 +8,13 @@ __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' import struct -from collections import OrderedDict +from collections import OrderedDict, namedtuple -from calibre.ebooks.mobi.utils import decint, count_set_bits +from calibre.ebooks.mobi.utils import (decint, count_set_bits, + 
decode_string) + +TagX = namedtuple('TagX', 'tag num_of_values bitmask eof') +PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values') class InvalidFile(ValueError): pass @@ -37,9 +41,8 @@ def parse_indx_header(data): 'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx' ) num = len(words) - values = struct.unpack(b'>%dL' % num, data[4:4*(num+1)]) - header = {words[i]:values[i] for i in xrange(num)} - return header + values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)]) + return dict(zip(words, values)) class CNCX(object): # {{{ @@ -77,81 +80,116 @@ class CNCX(object): # {{{ return self.records.get(offset, default) # }}} -def parse_tag_section(data): +def parse_tagx_section(data): check_signature(data, b'TAGX') tags = [] - first_entry_offset, = struct.unpack_from(b'>L', data, 0x04) - control_byte_count, = struct.unpack_from(b'>L', data, 0x08) + first_entry_offset, = struct.unpack_from(b'>L', data, 4) + control_byte_count, = struct.unpack_from(b'>L', data, 8) - # Skip the first 12 bytes already read above. for i in xrange(12, first_entry_offset, 4): - pos = i - tags.append((ord(data[pos]), ord(data[pos+1]), ord(data[pos+2]), - ord(data[pos+3]))) + vals = list(bytearray(data[i:i+4])) + tags.append(TagX(*vals)) return control_byte_count, tags -def get_tag_map(control_byte_count, tags, data, start, end): +def get_tag_map(control_byte_count, tagx, data, strict=False): ptags = [] ans = {} - control_byte_index = 0 - data_start = start + control_byte_count + control_bytes = list(bytearray(data[:control_byte_count])) + data = data[control_byte_count:] - for tag, values_per_entry, mask, end_flag in tags: - if end_flag == 0x01: - control_byte_index += 1 + for x in tagx: + if x.eof == 0x01: + control_bytes = control_bytes[1:] continue - value = ord(data[start + control_byte_index]) & mask + value = control_bytes[0] & x.bitmask if value != 0: - if value == mask: - if count_set_bits(mask) > 1: + value_count = value_bytes = None + if value == x.bitmask: + if count_set_bits(x.bitmask) > 1: # If all bits of masked value are set and the mask has more # than one bit, a variable width value will follow after # the control bytes which defines the length of bytes (NOT # the value count!) which will contain the corresponding # variable width values. - value, consumed = decint(data[data_start:]) - data_start += consumed - ptags.append((tag, None, value, values_per_entry)) + value_bytes, consumed = decint(data) + data = data[consumed:] else: - ptags.append((tag, 1, None, values_per_entry)) + value_count = 1 else: # Shift bits to get the masked value. - while mask & 0x01 == 0: - mask = mask >> 1 - value = value >> 1 - ptags.append((tag, value, None, values_per_entry)) - for tag, value_count, value_bytes, values_per_entry in ptags: + mask = x.bitmask + while mask & 0b1 == 0: + mask >>= 1 + value >>= 1 + value_count = value + ptags.append(PTagX(x.tag, value_count, value_bytes, + x.num_of_values)) + + for x in ptags: values = [] - if value_count != None: + if x.value_count is not None: # Read value_count * values_per_entry variable width values. - for _ in xrange(value_count*values_per_entry): - byts, consumed = decint(data[data_start:]) - data_start += consumed + for _ in xrange(x.value_count * x.num_of_values): + byts, consumed = decint(data) + data = data[consumed:] values.append(byts) - else: + else: # value_bytes is not None # Convert value_bytes to variable width values. 
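# A worked example (not part of the patch) of parse_tagx_section() above,
# applied to a hand-built TAGX block.  The single tag entry (tag 1, one value
# per entry, bitmask 0b1) is hypothetical; the trailing (0, 0, 0, 1) entry is
# the usual end-of-TAGX marker.
import struct
from calibre.ebooks.mobi.reader.index import parse_tagx_section

entries = bytes(bytearray([1, 1, 0b1, 0,    # tag 1: one value, bitmask 0b1
                           0, 0, 0, 1]))    # end-of-TAGX entry
tagx = b'TAGX' + struct.pack(b'>II', 12 + len(entries), 1) + entries
control_byte_count, tags = parse_tagx_section(tagx)
# control_byte_count == 1
# tags == [TagX(tag=1, num_of_values=1, bitmask=1, eof=0),
#          TagX(tag=0, num_of_values=0, bitmask=0, eof=1)]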
total_consumed = 0 - while total_consumed < value_bytes: + while total_consumed < x.value_bytes: # Does this work for values_per_entry != 1? - byts, consumed = decint(data[data_start:]) - data_start += consumed + byts, consumed = decint(data) + data = data[consumed:] total_consumed += consumed values.append(byts) - if total_consumed != value_bytes: - print ("Error: Should consume %s bytes, but consumed %s" % - (value_bytes, total_consumed)) - ans[tag] = values - # Test that all bytes have been processed if end is given. - if end is not None and data_start < end: - # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. - rest = data[data_start:end] - if rest.replace(b'\0', b''): - print ("Warning: There are unprocessed index bytes left: %s" % - format_bytes(rest)) + if total_consumed != x.value_bytes: + err = ("Error: Should consume %s bytes, but consumed %s" % + (x.value_bytes, total_consumed)) + if strict: + raise ValueError(err) + else: + print(err) + ans[x.tag] = values + # Test that all bytes have been processed + if data.replace(b'\0', b''): + err = ("Warning: There are unprocessed index bytes left: %s" % + format_bytes(data)) + if strict: + raise ValueError(err) + else: + print(err) return ans +def parse_index_record(table, data, control_byte_count, tags, codec, + strict=False): + header = parse_indx_header(data) + idxt_pos = header['start'] + if data[idxt_pos:idxt_pos+4] != b'IDXT': + print ('WARNING: Invalid INDX record') + entry_count = header['count'] + + # loop through to build up the IDXT position starts + idx_positions= [] + for j in xrange(entry_count): + pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) + idx_positions.append(pos) + # The last entry ends before the IDXT tag (but there might be zero fill + # bytes we need to ignore!) + idx_positions.append(idxt_pos) + + # For each entry in the IDXT build up the tag map and any associated + # text + for j in xrange(entry_count): + start, end = idx_positions[j:j+2] + rec = data[start:end] + ident, consumed = decode_string(rec, codec=codec) + rec = rec[consumed:] + tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict) + table[ident] = tag_map + + def read_index(sections, idx, codec): table, cncx = OrderedDict(), CNCX([], codec) @@ -166,32 +204,11 @@ def read_index(sections, idx, codec): cncx = CNCX(cncx_records, codec) tag_section_start = indx_header['len'] - control_byte_count, tags = parse_tag_section(data[tag_section_start:]) + control_byte_count, tags = parse_tagx_section(data[tag_section_start:]) for i in xrange(idx + 1, idx + 1 + indx_count): + # Index record data = sections[i][0] - header = parse_indx_header(data) - idxt_pos = header['start'] - entry_count = header['count'] - - # loop through to build up the IDXT position starts - idx_positions= [] - for j in xrange(entry_count): - pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j)) - idx_positions.append(pos) - # The last entry ends before the IDXT tag (but there might be zero fill - # bytes we need to ignore!) 
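# A worked example (not part of the patch) of get_tag_map() above, reusing the
# hypothetical TAGX entries from the previous sketch.  The entry data is one
# control byte (0b1, i.e. "tag 1 present") followed by a single variable-width
# value: 0x85 decodes to 5, the set high bit marking the final byte.
from calibre.ebooks.mobi.reader.index import TagX, get_tag_map

tags = [TagX(tag=1, num_of_values=1, bitmask=0b1, eof=0),
        TagX(tag=0, num_of_values=0, bitmask=0, eof=1)]
tag_map = get_tag_map(1, tags, b'\x01\x85', strict=True)
# tag_map == {1: [5]}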
- idx_positions.append(idxt_pos) - - # For each entry in the IDXT build up the tag map and any associated - # text - for j in xrange(entry_count): - start, end = idx_positions[j:j+2] - text_length = ord(data[start]) - text = data[start+1:start+1+text_length] - tag_map = get_tag_map(control_byte_count, tags, data, - start+1+text_length, end) - table[text] = tag_map - + parse_index_record(table, data, control_byte_count, tags, codec) return table, cncx diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index f5421bc9ea..7939f51ccf 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -317,6 +317,7 @@ class Mobi8Reader(object): for entry in index_entries: pos = entry['pos'] fi = self.get_file_info(pos) + #print (11111111, fi, entry['pos_fid']) if fi.filename is None: raise ValueError('Index entry has invalid pos: %d'%pos) idtag = self.get_id_tag(pos).decode(self.header.codec) diff --git a/src/calibre/ebooks/mobi/reader/ncx.py b/src/calibre/ebooks/mobi/reader/ncx.py index 96ab4ac70d..ca3255e100 100644 --- a/src/calibre/ebooks/mobi/reader/ncx.py +++ b/src/calibre/ebooks/mobi/reader/ncx.py @@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en' import os from calibre.ebooks.metadata.toc import TOC -from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.mobi.reader.headers import NULL_INDEX from calibre.ebooks.mobi.reader.index import read_index @@ -23,7 +22,30 @@ tag_fieldname_map = { 6: ['pos_fid',0], 21: ['parent',0], 22: ['child1',0], - 23: ['childn',0] + 23: ['childn',0], + 69: ['image_index',0], + 70 : ['desc_offset', 0], # 'Description offset in cncx' + 71 : ['author_offset', 0], # 'Author offset in cncx' + 72 : ['image_caption_offset', 0], # 'Image caption offset in cncx', + 73 : ['image_attr_offset', 0], # 'Image attribution offset in cncx', + +} + +default_entry = { + 'pos': -1, + 'len': 0, + 'noffs': -1, + 'text' : "Unknown Text", + 'hlvl' : -1, + 'kind' : "Unknown Class", + 'pos_fid' : None, + 'parent' : -1, + 'child1' : -1, + 'childn' : -1, + 'description': None, + 'author': None, + 'image_caption': None, + 'image_attribution': None, } def read_ncx(sections, index, codec): @@ -34,32 +56,25 @@ def read_ncx(sections, index, codec): for num, x in enumerate(table.iteritems()): text, tag_map = x - entry = { - 'name': text, - 'pos': -1, - 'len': 0, - 'noffs': -1, - 'text' : "Unknown Text", - 'hlvl' : -1, - 'kind' : "Unknown Kind", - 'pos_fid' : None, - 'parent' : -1, - 'child1' : -1, - 'childn' : -1, - 'num' : num - } + entry = default_entry.copy() + entry['name'] = text + entry['num'] = num - for tag in tag_fieldname_map.keys(): + for tag in tag_fieldname_map.iterkeys(): fieldname, i = tag_fieldname_map[tag] if tag in tag_map: fieldvalue = tag_map[tag][i] if tag == 6: - fieldvalue = to_base(fieldvalue, base=32) + # Appears to be an idx into the KF8 elems table with an + # offset + fieldvalue = tuple(tag_map[tag]) entry[fieldname] = fieldvalue - if tag == 3: - entry['text'] = cncx.get(fieldvalue, 'Unknown Text') - if tag == 5: - entry['kind'] = cncx.get(fieldvalue, 'Unknown Kind') + for which, name in {3:'text', 5:'kind', 70:'description', + 71:'author', 72:'image_caption', + 73:'image_attribution'}.iteritems(): + if tag == which: + entry[name] = cncx.get(fieldvalue, + default_entry[name]) index_entries.append(entry) return index_entries diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 6ec86f77ee..2bab82bc53 100644 --- 
a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -15,7 +15,13 @@ from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
 
-def decode_hex_number(raw):
+def decode_string(raw, codec='utf-8'):
+    length, = struct.unpack(b'>B', raw[0])
+    raw = raw[1:1+length]
+    consumed = length+1
+    return raw.decode(codec), consumed
+
+def decode_hex_number(raw, codec='utf-8'):
     '''
     Return a variable length number encoded using hexadecimal encoding. These
     numbers have the first byte which tells the number of bytes that follow.
@@ -25,13 +31,16 @@
     :param raw: Raw binary data as a bytestring
 
     :return: The number and the number of bytes from raw that the number
-    occupies
+    occupies.
     '''
-    length, = struct.unpack(b'>B', raw[0])
-    raw = raw[1:1+length]
-    consumed = length+1
+    raw, consumed = decode_string(raw, codec=codec)
     return int(raw, 16), consumed
 
+def encode_string(raw):
+    ans = bytearray(bytes(raw))
+    ans.insert(0, len(ans))
+    return bytes(ans)
+
 def encode_number_as_hex(num):
     '''
     Encode num as a variable length encoded hexadecimal number. Returns the
@@ -44,9 +53,7 @@
     nlen = len(num)
     if nlen % 2 != 0:
         num = b'0'+num
-    ans = bytearray(num)
-    ans.insert(0, len(num))
-    return bytes(ans)
+    return encode_string(num)
 
 def encint(value, forward=True):
     '''