diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index e038ffa63d..ca1da86ac2 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -646,7 +646,7 @@ class Tag(object): # {{{ INTERPRET_MAP = { 'subchapter': { - 5 : ('Parent chapter index', 'parent_index') + 21 : ('Parent chapter index', 'parent_index') }, 'article' : { @@ -702,7 +702,8 @@ class Tag(object): # {{{ self.desc, self.attr = td[tag_type] except: print ('Unknown tag value: %d'%tag_type) - self.desc = '??Unknown (tag value: %d)'%tag_type + self.desc = '??Unknown (tag value: %d type: %s)'%( + tag_type, entry_type) self.attr = 'unknown' if '_offset' in self.attr: self.cncx_value = cncx[self.value] @@ -750,7 +751,7 @@ class IndexEntry(object): # {{{ try: self.entry_type = self.TYPES[entry_type] except KeyError: - raise ValueError('Unknown Index Entry type: %s'%hex(entry_type)) + raise ValueError('Unknown Index Entry type: %s'%bin(entry_type)) if control_byte_count not in (1, 2): raise ValueError('Unknown control byte count: %d'% @@ -1223,8 +1224,7 @@ class TBSIndexing(object): # {{{ tbs_type = 0 is_periodical = self.doc_type in (257, 258, 259) if len(byts): - outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if - is_periodical else 3) + outermost_index, extra, consumed = decode_tbs(byts, flag_size=3) byts = byts[consumed:] for k in extra: tbs_type |= k diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 2238601a72..6b7939dc30 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -109,6 +109,20 @@ class TAGX(object): # {{{ list(map(self.add_tag, (11, 0))) return self.header(1) + bytes(self.byts) + + +class TAGX_BOOK(TAGX): + BITMASKS = dict(TAGX.BITMASKS) + BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 21, 22, 23])}) + + @property + def hierarchical_book(self): + ''' + TAGX block for the primary index header of a hierarchical book + ''' + list(map(self.add_tag, (1, 2, 3, 4, 21, 22, 23, 0))) + return self.header(1) + bytes(self.byts) + @property def flat_book(self): ''' @@ -117,6 +131,7 @@ class TAGX(object): # {{{ list(map(self.add_tag, (1, 2, 3, 4, 0))) return self.header(1) + bytes(self.byts) + # }}} # Index Entries {{{ @@ -187,6 +202,9 @@ class IndexEntry(object): ans |= TAGX.BITMASKS[tag] return ans + def attr_for_tag(self, tag): + return self.RTAG_MAP[tag] + @property def bytestring(self): buf = StringIO() @@ -210,7 +228,7 @@ class IndexEntry(object): buf.write(bytes(bytearray([flags]))) for tag in self.tag_nums: - attr = self.RTAG_MAP[tag] + attr = self.attr_for_tag(tag) val = getattr(self, attr) if isinstance(val, int): val = [val] @@ -226,6 +244,17 @@ class IndexEntry(object): ans = buf.getvalue() return ans +class BookIndexEntry(IndexEntry): + + @property + def entry_type(self): + tagx = TAGX_BOOK() + ans = 0 + for tag in self.tag_nums: + ans |= tagx.BITMASKS[tag] + return ans + + class PeriodicalIndexEntry(IndexEntry): def __init__(self, offset, label_offset, class_offset, depth): @@ -461,7 +490,6 @@ class Indexer(object): # {{{ if not desc: desc = _('No details available') node.author, node.description = aut, desc - self.cncx = CNCX(oeb.toc, self.is_periodical) if self.is_periodical: @@ -529,7 +557,9 @@ class Indexer(object): # {{{ tagx_block = TAGX().secondary else: tagx_block = (TAGX().periodical if self.is_periodical else - TAGX().flat_book) + (TAGX_BOOK().hierarchical_book if + self.book_has_subchapters else + TAGX_BOOK().flat_book)) header_length = 192 # Ident 0 - 4 @@ -615,47 +645,98 @@ class Indexer(object): # {{{ # }}} def create_book_index(self): # {{{ + self.book_has_subchapters = False indices = [] - seen = set() + seen, sub_seen = set(), set() id_offsets = self.serializer.id_offsets - for node in self.oeb.toc.iterdescendants(): + # Flatten toc to contain only chapters and subchapters + # Anything deeper than a subchapter is made into a subchapter + chapters = [] + for node in self.oeb.toc: try: offset = id_offsets[node.href] label = self.cncx[node.title] except: - self.log.warn('TOC item %s not found in document'%node.href) + self.log.warn('TOC item %s [%s] not found in document'%( + node.title, node.href)) continue + if offset in seen: continue seen.add(offset) - index = IndexEntry(offset, label) - indices.append(index) - indices.sort(key=lambda x:x.offset) + subchapters = [] + chapters.append((offset, label, subchapters)) - # Set lengths - for i, index in enumerate(indices): - try: - next_offset = indices[i+1].offset - except: - next_offset = self.serializer.body_end_offset - index.length = next_offset - index.offset + for descendant in node.iterdescendants(): + try: + offset = id_offsets[descendant.href] + label = self.cncx[descendant.title] + except: + self.log.warn('TOC item %s [%s] not found in document'%( + descendant.title, descendant.href)) + continue - # Remove empty nodes - indices = [i for i in indices if i.length > 0] + if offset in sub_seen: + continue + sub_seen.add(offset) + subchapters.append((offset, label)) - # Set index values - for i, index in enumerate(indices): - index.index = i + subchapters.sort(key=lambda x:x[0]) - # Set lengths again to close up any gaps left by filtering - for i, index in enumerate(indices): - try: - next_offset = indices[i+1].offset - except: - next_offset = self.serializer.body_end_offset - index.length = next_offset - index.offset + chapters.sort(key=lambda x:x[0]) + + chapters = [(BookIndexEntry(x[0], x[1]), [ + BookIndexEntry(y[0], y[1]) for y in x[2]]) for x in chapters] + + def set_length(indices): + for i, index in enumerate(indices): + try: + next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + # Set chapter and subchapter lengths + set_length([x[0] for x in chapters]) + for x in chapters: + set_length(x[1]) + + # Remove empty chapters + chapters = [x for x in chapters if x[0].length > 0] + + # Remove invalid subchapters + for i, x in enumerate(list(chapters)): + chapter, subchapters = x + ok_subchapters = [] + for sc in subchapters: + if sc.offset < chapter.next_offset and sc.length > 0: + ok_subchapters.append(sc) + chapters[i] = (chapter, ok_subchapters) + + # Reset chapter and subchapter lengths in case any were removed + set_length([x[0] for x in chapters]) + for x in chapters: + set_length(x[1]) + + # Set index and depth values + indices = [] + for index, x in enumerate(chapters): + x[0].index = index + indices.append(x[0]) + + for chapter, subchapters in chapters: + for sc in subchapters: + index += 1 + sc.index = index + sc.parent_index = chapter.index + indices.append(sc) + sc.depth = 1 + self.book_has_subchapters = True + if subchapters: + chapter.first_child_index = subchapters[0].index + chapter.last_child_index = subchapters[-1].index return indices