diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py
index 67f20e691f..f35d8ac075 100644
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@@ -957,15 +957,17 @@ class TBSIndexing(object): # {{{
             return str({bin4(k):v for k, v in extra.iteritems()})
 
         tbs_type = 0
+        is_periodical = self.doc_type in (257, 258, 259)
         if len(byts):
-            outermost_index, extra, consumed = decode_tbs(byts)
+            outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
+                    is_periodical else 3)
             byts = byts[consumed:]
             for k in extra:
                 tbs_type |= k
             ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
             ans.append('Outermost index: %d'%outermost_index)
             ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
-            if self.doc_type in (257, 259): # Hierarchical periodical
+            if is_periodical: # Hierarchical periodical
                 byts, a = self.interpret_periodical(tbs_type, byts,
                         dat['geom'][0])
                 ans += a
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 37d2093066..16aa2a3b64 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -66,11 +66,14 @@ def encint(value, forward=True):
     If forward is True the bytes returned are suitable for prepending to the
     output buffer, otherwise they must be append to the output buffer.
     '''
+    if value < 0:
+        raise ValueError('Cannot encode negative numbers as vwi')
     # Encode vwi
     byts = bytearray()
     while True:
         b = value & 0b01111111
         value >>= 7 # shift value to the right by 7 bits
+
         byts.append(b)
         if value == 0:
             break
@@ -198,24 +201,31 @@ def encode_trailing_data(raw):
         lsize += 1
     return raw + encoded
 
-def encode_fvwi(val, flags):
+def encode_fvwi(val, flags, flag_size=4):
     '''
-    Encode the value val and the 4 bit flags flags as a fvwi. This encoding is
+    Encode the value val and the flag_size bits from flags as a fvwi. This encoding is
     used in the trailing byte sequences for indexing. Returns encoded
     bytestring.
     '''
-    ans = (val << 4) | (flags & 0b1111)
+    ans = val << flag_size
+    for i in xrange(flag_size):
+        ans |= (flags & (1 << i))
     return encint(ans)
 
-def decode_fvwi(byts):
+def decode_fvwi(byts, flag_size=4):
     '''
     Decode encoded fvwi. Returns number, flags, consumed
     '''
     arg, consumed = decint(bytes(byts))
-    return (arg >> 4), (arg & 0b1111), consumed
+    val = arg >> flag_size
+    flags = 0
+    for i in xrange(flag_size):
+        flags |= (arg & (1 << i))
+    return val, flags, consumed
 
-def decode_tbs(byts):
+
+def decode_tbs(byts, flag_size=4):
     '''
     Trailing byte sequences for indexing consists of series of fvwi numbers.
     This function reads the fvwi number and its associated flags. It them uses
@@ -226,10 +236,10 @@ def decode_tbs(byts):
     data and the number of bytes consumed.
     '''
     byts = bytes(byts)
-    val, flags, consumed = decode_fvwi(byts)
+    val, flags, consumed = decode_fvwi(byts, flag_size=flag_size)
     extra = {}
     byts = byts[consumed:]
-    if flags & 0b1000:
+    if flags & 0b1000 and flag_size > 3:
         extra[0b1000] = True
     if flags & 0b0010:
         x, consumed2 = decint(byts)
@@ -247,7 +257,7 @@ def decode_tbs(byts):
             consumed += consumed2
     return val, extra, consumed
 
-def encode_tbs(val, extra):
+def encode_tbs(val, extra, flag_size=4):
     '''
     Encode the number val and the extra data in the extra dict as an fvwi. See
     decode_tbs above.
@@ -255,7 +265,7 @@ def encode_tbs(val, extra):
     flags = 0
     for flag in extra:
         flags |= flag
-    ans = encode_fvwi(val, flags)
+    ans = encode_fvwi(val, flags, flag_size=flag_size)
 
     if 0b0010 in extra:
         ans += encint(extra[0b0010])
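
Reviewer sketch (not part of the patch): the heart of the utils.py change is that the fvwi helpers now take a flag_size parameter, so the same code can pack a 3-bit flag field (plain books) or a 4-bit flag field (periodicals) below the value bits. The snippet below is a standalone illustration of that bit layout; pack_fvwi and unpack_fvwi are hypothetical stand-ins that mirror the patched encode_fvwi/decode_fvwi minus the vwi byte serialization handled by encint/decint.

    def pack_fvwi(val, flags, flag_size=4):
        # Value in the high bits, flags in the low flag_size bits.
        return (val << flag_size) | (flags & ((1 << flag_size) - 1))

    def unpack_fvwi(arg, flag_size=4):
        # Inverse: split the decoded integer back into (value, flags).
        return arg >> flag_size, arg & ((1 << flag_size) - 1)

    # The same (value, flags) pair packs differently for 3- and 4-bit flag fields.
    assert pack_fvwi(5, 0b011, flag_size=3) == 0b101011
    assert pack_fvwi(5, 0b011, flag_size=4) == 0b1010011
    assert unpack_fvwi(0b101011, flag_size=3) == (5, 0b011)
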
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index 0f7a670cff..ece96e3a7c 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -28,13 +28,12 @@ class CNCX(object): # {{{
 
     MAX_STRING_LENGTH = 500
 
-    def __init__(self, toc, opts):
+    def __init__(self, toc, is_periodical):
         self.strings = OrderedDict()
 
-        for item in toc:
-            if item is self.toc: continue
+        for item in toc.iterdescendants():
             self.strings[item.title] = 0
-            if opts.mobi_periodical:
+            if is_periodical:
                 self.strings[item.klass] = 0
 
         self.records = []
@@ -91,6 +90,17 @@ class IndexEntry(object): # {{{
         self.first_child_index = None
         self.last_child_index = None
 
+    def __repr__(self):
+        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
+                ' parent_index=%r)')%(self.offset, self.depth, self.length,
+                        self.index, self.parent_index)
+
+    @dynamic_property
+    def size(self):
+        def fget(self): return self.length
+        def fset(self, val): self.length = val
+        return property(fget=fget, fset=fset, doc='Alias for length')
+
     @classmethod
     def tagx_block(cls, for_periodical=True):
         buf = bytearray()
@@ -137,7 +147,7 @@ class IndexEntry(object): # {{{
     def entry_type(self):
         ans = 0
         for tag in self.tag_nums:
-            ans |= (1 << self.BITMASKS[tag]) # 1 << x == 2**x
+            ans |= (1 << self.BITMASKS.index(tag)) # 1 << x == 2**x
         return ans
 
     @property
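
Reviewer sketch (not part of the patch): the entry_type fix above shifts by the position of each tag in BITMASKS instead of by the list element looked up at index tag. A small illustration of the corrected computation, with a hypothetical BITMASKS list and tag set:

    BITMASKS = [1, 2, 3, 4, 21, 22, 23]    # hypothetical tag numbers; position == bit
    tag_nums = [1, 4, 21]                  # tags present on one index entry

    ans = 0
    for tag in tag_nums:
        ans |= (1 << BITMASKS.index(tag))  # bit index = position of the tag in BITMASKS
    assert ans == 0b11001                  # bits 0, 3 and 4 set
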
@@ -152,7 +162,7 @@ class IndexEntry(object): # {{{
             val = getattr(self, attr)
             buf.write(encint(val))
 
-        ans = buf.get_value()
+        ans = buf.getvalue()
         return ans
 
 # }}}
@@ -175,13 +185,16 @@ class TBS(object): # {{{
             # The starting bytes.
             # The value is zero which I think indicates the periodical
             # index entry. The values for the various flags seem to be
-            # unused. If the 0b0100 is present, it means that the record
+            # unused. If the 0b100 is present, it means that the record
             # deals with section 1 (or is the final record with section
             # transitions).
-            self.type_010 = encode_tbs(0, {0b0010: 0})
-            self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
-            self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
-            self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})
+            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
+            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
+                    flag_size=3)
+            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
+                    flag_size=3)
+            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
+                0}, flag_size=3)
 
             depth_map = defaultdict(list)
             for x in ('starts', 'ends', 'completes'):
@@ -221,12 +234,18 @@ class TBS(object): # {{{
                         self.type_010)
             elif not depth_map[1]:
                 # has only article nodes, i.e. spanned by a section
-                parent_section_index = self.depth_map[2][0].parent_index
+                parent_section_index = depth_map[2][0].parent_index
                 typ = (self.type_111 if parent_section_index == 1 else
                         self.type_010)
             else:
                 # has section transitions
-                parent_section_index = self.depth_map[2][0].parent_index
+                if depth_map[2]:
+                    parent_section_index = depth_map[2][0].parent_index
+                    typ = self.type_011
+                else:
+                    parent_section_index = depth_map[1][0].index
+                    typ = (self.type_110 if parent_section_index == 1 else
+                            self.type_011)
 
             buf.write(typ)
 
@@ -243,9 +262,10 @@ class TBS(object): # {{{
 
             if spanner is None:
                 articles = depth_map[2]
-                sections = [self.section_map[a.parent_index] for a in articles]
-                sections.sort(key=lambda x:x.offset)
-                section_map = {s:[a for a in articles is a.parent_index ==
+                sections = set([self.section_map[a.parent_index] for a in
+                    articles])
+                sections = sorted(sections, key=lambda x:x.offset)
+                section_map = {s:[a for a in articles if a.parent_index ==
                     s.index] for s in sections}
                 for i, section in enumerate(sections):
                     # All the articles in this record that belong to section
@@ -257,7 +277,7 @@ class TBS(object): # {{{
                     try:
                         next_sec = sections[i+1]
                     except:
-                        next_sec == None
+                        next_sec = None
 
                     extra = {}
                     if num > 1:
@@ -299,14 +319,14 @@ class Indexer(object): # {{{
         self.log('Generating MOBI index for a %s'%('periodical' if
             self.is_periodical else 'book'))
         self.is_flat_periodical = False
-        if opts.mobi_periodical:
+        if self.is_periodical:
             periodical_node = iter(oeb.toc).next()
             sections = tuple(periodical_node)
             self.is_flat_periodical = len(sections) == 1
 
         self.records = []
 
-        self.cncx = CNCX(oeb.toc, opts)
+        self.cncx = CNCX(oeb.toc, self.is_periodical)
 
         if self.is_periodical:
             self.indices = self.create_periodical_index()
@@ -405,7 +425,7 @@ class Indexer(object): # {{{
         buf.write(pack(b'>I', 0)) # Filled in later
 
         # Number of index records 24-28
-        buf.write(pack('b>I', len(self.records)))
+        buf.write(pack(b'>I', len(self.records)))
 
         # Index Encoding 28-32
         buf.write(pack(b'>I', 65001)) # utf-8
@@ -457,7 +477,7 @@ class Indexer(object): # {{{
 
         idxt_offset = buf.tell()
         buf.write(b'IDXT')
-        buf.write(header_length + len(tagx_block))
+        buf.write(pack(b'>H', header_length + len(tagx_block)))
         buf.write(b'\0')
         buf.seek(20)
         buf.write(pack(b'>I', idxt_offset))
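
Reviewer sketch (not part of the patch): the two INDX header fixes above replace a malformed format string ('b>I') and a bare integer write with explicit big-endian struct packing. A minimal illustration of what such writes produce; the values 192 (header_length) and 32 (TAGX block size) are purely illustrative:

    from io import BytesIO
    from struct import pack

    buf = BytesIO()
    buf.write(pack(b'>H', 192 + 32))   # 2-byte big-endian value, as written after IDXT
    buf.write(pack(b'>I', 65001))      # 4-byte big-endian value (utf-8 code page)
    assert buf.getvalue() == b'\x00\xe0\x00\x00\xfd\xe9'
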
@@ -567,7 +587,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, normalized_articles = x
             try:
-                sec.length = normalized_sections[s+1].offset - sec.offset
+                sec.length = normalized_sections[s+1][0].offset - sec.offset
             except:
                 sec.length = self.serializer.body_end_offset - sec.offset
             for i, art in enumerate(normalized_articles):
@@ -583,17 +603,18 @@ class Indexer(object): # {{{
                     normalized_articles))
             normalized_sections[i] = (sec, normalized_articles)
 
-        normalized_sections = list(filter(lambda x: x[0].size > 0 and x[1],
+        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
             normalized_sections))
 
         # Set indices
         i = 0
-        for sec, normalized_articles in normalized_sections:
+        for sec, articles in normalized_sections:
             i += 1
             sec.index = i
+            sec.parent_index = 0
 
-        for sec, normalized_articles in normalized_sections:
-            for art in normalized_articles:
+        for sec, articles in normalized_sections:
+            for art in articles:
                 i += 1
                 art.index = i
                 art.parent_index = sec.index
@@ -606,7 +627,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, articles = x
             try:
-                next_offset = normalized_sections[s+1].offset
+                next_offset = normalized_sections[s+1][0].offset
             except:
                 next_offset = self.serializer.body_end_offset
             sec.length = next_offset - sec.offset
@@ -622,7 +643,7 @@ class Indexer(object): # {{{
         for s, x in enumerate(normalized_sections):
             sec, articles = x
             try:
-                next_sec = normalized_sections[s+1]
+                next_sec = normalized_sections[s+1][0]
             except:
                 if (sec.length == 0 or
                         sec.next_offset != self.serializer.body_end_offset):
@@ -659,6 +680,7 @@ class Indexer(object): # {{{
         self.tbs_map = {}
         found_node = False
         sections = [i for i in self.indices if i.depth == 1]
+        deepest = max(i.depth for i in self.indices)
        for i in xrange(self.number_of_text_records):
             offset = i * RECORD_SIZE
             next_offset = offset + RECORD_SIZE
@@ -683,7 +705,7 @@ class Indexer(object): # {{{
                 if index.next_offset <= next_offset:
                     # Node ends in current record
                     data['ends'].append(index)
-                else:
+                elif index.depth == deepest:
                     data['spans'] = index
             if (data['ends'] or data['completes'] or data['starts'] or
                     data['spans'] is not None):
diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py
index 06572f48c4..a5e80cc3cd 100644
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -55,6 +55,7 @@ class MobiWriter(object):
         self.last_text_record_idx = 1
 
     def __call__(self, oeb, path_or_stream):
+        self.log = oeb.log
         if hasattr(path_or_stream, 'write'):
             return self.dump_stream(oeb, path_or_stream)
         with open(path_or_stream, 'w+b') as stream:
@@ -90,6 +91,7 @@ class MobiWriter(object):
         self.primary_index_record_idx = None
         try:
             self.indexer = Indexer(self.serializer, self.last_text_record_idx,
+                    len(self.records[self.last_text_record_idx]),
                     self.opts, self.oeb)
         except:
             self.log.exception('Failed to generate MOBI index:')
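
Reviewer sketch (not part of the patch): several of the indexer.py fixes above add a [0] when reading a neighbouring section's offset, because normalized_sections holds (section, articles) tuples rather than bare section entries. A contrived example with a stand-in entry class (Entry is hypothetical, for illustration only):

    class Entry(object):
        def __init__(self, offset):
            self.offset = offset
            self.length = 0

    normalized_sections = [(Entry(0), ['a1', 'a2']), (Entry(4096), ['a3'])]

    sec = normalized_sections[0][0]
    # Patched form: unpack the section from the tuple before reading .offset.
    sec.length = normalized_sections[0 + 1][0].offset - sec.offset
    assert sec.length == 4096
    # The old form, normalized_sections[s+1].offset, fails because a tuple
    # has no .offset attribute.
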