New MOBI output: Start work on support for hierarchical books

2025-07-09 03:04:10 -04:00 · 2011-08-04 20:03:54 -06:00 · 2011-08-04 20:03:54 -06:00 · 7e29ea72ad
commit 7e29ea72ad
parent b203704a6b
2 changed files with 114 additions and 33 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -646,7 +646,7 @@ class Tag(object): # {{{
    INTERPRET_MAP = {
            'subchapter': {
-                    5  : ('Parent chapter index', 'parent_index')
+                    21  : ('Parent chapter index', 'parent_index')
            },
            'article'   : {
@ -702,7 +702,8 @@ class Tag(object): # {{{
                self.desc, self.attr = td[tag_type]
            except:
                print ('Unknown tag value: %d'%tag_type)
-                self.desc = '??Unknown (tag value: %d)'%tag_type
+                self.desc = '??Unknown (tag value: %d type: %s)'%(
                        tag_type, entry_type)
                self.attr = 'unknown'
        if '_offset' in self.attr:
            self.cncx_value = cncx[self.value]
@ -750,7 +751,7 @@ class IndexEntry(object): # {{{
        try:
            self.entry_type = self.TYPES[entry_type]
        except KeyError:
-            raise ValueError('Unknown Index Entry type: %s'%hex(entry_type))
+            raise ValueError('Unknown Index Entry type: %s'%bin(entry_type))
        if control_byte_count not in (1, 2):
            raise ValueError('Unknown control byte count: %d'%
@ -1223,8 +1224,7 @@ class TBSIndexing(object): # {{{
        tbs_type = 0
        is_periodical = self.doc_type in (257, 258, 259)
        if len(byts):
-            outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
+            outermost_index, extra, consumed = decode_tbs(byts, flag_size=3)
                    is_periodical else 3)
            byts = byts[consumed:]
            for k in extra:
                tbs_type |= k
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@ -109,6 +109,20 @@ class TAGX(object): # {{{
        list(map(self.add_tag, (11, 0)))
        return self.header(1) + bytes(self.byts)
 class TAGX_BOOK(TAGX):
    # Start from a private copy of the TAGX.BITMASKS table so that the update
    # below does not modify the dict held by TAGX.
    BITMASKS = dict(TAGX.BITMASKS)
    # Assign tags 1, 2, 3, 4, 21, 22 and 23 the single-bit masks 1 << 0 through
    # 1 << 6, in that order, replacing any copied entries for those tags.
    BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 21, 22, 23])})
    @property
    def hierarchical_book(self):
        '''
        Serialized TAGX block used in the primary index header when the book
        has a chapter/subchapter hierarchy.
        '''
        # Tag 0 comes last, after the seven entry tags
        for tag_number in (1, 2, 3, 4, 21, 22, 23, 0):
            self.add_tag(tag_number)
        header_bytes = self.header(1)
        return header_bytes + bytes(self.byts)
    @property
    def flat_book(self):
        '''
@ -117,6 +131,7 @@ class TAGX(object): # {{{
        list(map(self.add_tag, (1, 2, 3, 4, 0)))
        return self.header(1) + bytes(self.byts)
 # }}}
 # Index Entries {{{
@ -187,6 +202,9 @@ class IndexEntry(object):
            ans |= TAGX.BITMASKS[tag]
        return ans
    def attr_for_tag(self, tag):
        '''
        Return the name of the attribute that stores the value for ``tag``,
        looked up in ``RTAG_MAP``. Unknown tags raise whatever the map raises
        (``KeyError`` for a plain dict).
        '''
        tag_to_attr = self.RTAG_MAP
        return tag_to_attr[tag]
    @property
    def bytestring(self):
        buf = StringIO()
@ -210,7 +228,7 @@ class IndexEntry(object):
            buf.write(bytes(bytearray([flags])))
        for tag in self.tag_nums:
-            attr = self.RTAG_MAP[tag]
+            attr = self.attr_for_tag(tag)
            val = getattr(self, attr)
            if isinstance(val, int):
                val = [val]
@ -226,6 +244,17 @@ class IndexEntry(object):
        ans = buf.getvalue()
        return ans
 class BookIndexEntry(IndexEntry):
    '''Index entry whose type bits come from the TAGX_BOOK bitmask table.'''

    @property
    def entry_type(self):
        '''OR together the TAGX_BOOK bitmask of every tag in ``tag_nums``.'''
        book_tagx = TAGX_BOOK()
        mask = 0
        for tag_num in self.tag_nums:
            mask = mask | book_tagx.BITMASKS[tag_num]
        return mask
 class PeriodicalIndexEntry(IndexEntry):
    def __init__(self, offset, label_offset, class_offset, depth):
@ -461,7 +490,6 @@ class Indexer(object): # {{{
                    if not desc: desc = _('No details available')
                    node.author, node.description = aut, desc
        self.cncx = CNCX(oeb.toc, self.is_periodical)
        if self.is_periodical:
@ -529,7 +557,9 @@ class Indexer(object): # {{{
            tagx_block = TAGX().secondary
        else:
            tagx_block = (TAGX().periodical if self.is_periodical else
-                            TAGX().flat_book)
+                            (TAGX_BOOK().hierarchical_book if
                                self.book_has_subchapters else
                                TAGX_BOOK().flat_book))
        header_length = 192
        # Ident 0 - 4
@ -615,26 +645,52 @@ class Indexer(object): # {{{
    # }}}
    def create_book_index(self): # {{{
        self.book_has_subchapters = False
        indices = []
-        seen = set()
+        seen, sub_seen = set(), set()
        id_offsets = self.serializer.id_offsets
-        for node in self.oeb.toc.iterdescendants():
+        # Flatten toc to contain only chapters and subchapters
        # Anything deeper than a subchapter is made into a subchapter
        chapters = []
        for node in self.oeb.toc:
            try:
                offset = id_offsets[node.href]
                label = self.cncx[node.title]
            except:
-                self.log.warn('TOC item %s not found in document'%node.href)
+                self.log.warn('TOC item %s [%s] not found in document'%(
                    node.title, node.href))
                continue
            if offset in seen:
                continue
            seen.add(offset)
            index = IndexEntry(offset, label)
            indices.append(index)
-        indices.sort(key=lambda x:x.offset)
+            subchapters = []
            chapters.append((offset, label, subchapters))
-        # Set lengths
+            for descendant in node.iterdescendants():
                try:
                    offset = id_offsets[descendant.href]
                    label = self.cncx[descendant.title]
                except:
                    self.log.warn('TOC item %s [%s] not found in document'%(
                        descendant.title, descendant.href))
                    continue
                if offset in sub_seen:
                    continue
                sub_seen.add(offset)
                subchapters.append((offset, label))
            subchapters.sort(key=lambda x:x[0])
        chapters.sort(key=lambda x:x[0])
        chapters = [(BookIndexEntry(x[0], x[1]), [
            BookIndexEntry(y[0], y[1]) for y in x[2]]) for x in chapters]
        def set_length(indices):
            for i, index in enumerate(indices):
                try:
                    next_offset = indices[i+1].offset
@ -642,20 +698,45 @@ class Indexer(object): # {{{
                    next_offset = self.serializer.body_end_offset
                index.length = next_offset - index.offset
-        # Remove empty nodes
+        # Set chapter and subchapter lengths
-        indices = [i for i in indices if i.length > 0]
+        set_length([x[0] for x in chapters])
        for x in chapters:
            set_length(x[1])
-        # Set index values
+        # Remove empty chapters
-        for i, index in enumerate(indices):
+        chapters = [x for x in chapters if x[0].length > 0]
            index.index = i
-        # Set lengths again to close up any gaps left by filtering
+        # Remove invalid subchapters
-        for i, index in enumerate(indices):
+        for i, x in enumerate(list(chapters)):
-            try:
+            chapter, subchapters = x
-                next_offset = indices[i+1].offset
+            ok_subchapters = []
-            except:
+            for sc in subchapters:
-                next_offset = self.serializer.body_end_offset
+                if sc.offset < chapter.next_offset and sc.length > 0:
-            index.length = next_offset - index.offset
+                    ok_subchapters.append(sc)
            chapters[i] = (chapter, ok_subchapters)
        # Reset chapter and subchapter lengths in case any were removed
        set_length([x[0] for x in chapters])
        for x in chapters:
            set_length(x[1])
        # Set index and depth values
        indices = []
        for index, x in enumerate(chapters):
            x[0].index = index
            indices.append(x[0])
        for chapter, subchapters in chapters:
            for sc in subchapters:
                index += 1
                sc.index = index
                sc.parent_index = chapter.index
                indices.append(sc)
                sc.depth = 1
                self.book_has_subchapters = True
            if subchapters:
                chapter.first_child_index = subchapters[0].index
                chapter.last_child_index = subchapters[-1].index
        return indices