New MOBI output: Start work on support for hierarchical books

2025-07-09 03:04:10 -04:00 · 2011-08-04 20:03:54 -06:00 · 2011-08-04 20:03:54 -06:00 · 7e29ea72ad
commit 7e29ea72ad
parent b203704a6b
2 changed files with 114 additions and 33 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -646,7 +646,7 @@ class Tag(object): # {{{

    INTERPRET_MAP = {
            'subchapter': {
-                    5  : ('Parent chapter index', 'parent_index')
+                    21  : ('Parent chapter index', 'parent_index')
            },

            'article'   : {
@ -702,7 +702,8 @@ class Tag(object): # {{{
                self.desc, self.attr = td[tag_type]
            except:
                print ('Unknown tag value: %d'%tag_type)
-                self.desc = '??Unknown (tag value: %d)'%tag_type
+                self.desc = '??Unknown (tag value: %d type: %s)'%(
+                        tag_type, entry_type)
                self.attr = 'unknown'
        if '_offset' in self.attr:
            self.cncx_value = cncx[self.value]
@ -750,7 +751,7 @@ class IndexEntry(object): # {{{
        try:
            self.entry_type = self.TYPES[entry_type]
        except KeyError:
-            raise ValueError('Unknown Index Entry type: %s'%hex(entry_type))
+            raise ValueError('Unknown Index Entry type: %s'%bin(entry_type))

        if control_byte_count not in (1, 2):
            raise ValueError('Unknown control byte count: %d'%
@ -1223,8 +1224,7 @@ class TBSIndexing(object): # {{{
        tbs_type = 0
        is_periodical = self.doc_type in (257, 258, 259)
        if len(byts):
-            outermost_index, extra, consumed = decode_tbs(byts, flag_size=4 if
-                    is_periodical else 3)
+            outermost_index, extra, consumed = decode_tbs(byts, flag_size=3)
            byts = byts[consumed:]
            for k in extra:
                tbs_type |= k
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@ -109,6 +109,20 @@ class TAGX(object): # {{{
        list(map(self.add_tag, (11, 0)))
        return self.header(1) + bytes(self.byts)

+
+
+class TAGX_BOOK(TAGX):
+    BITMASKS = dict(TAGX.BITMASKS)
+    BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 21, 22, 23])})
+
+    @property
+    def hierarchical_book(self):
+        '''
+        TAGX block for the primary index header of a hierarchical book
+        '''
+        list(map(self.add_tag, (1, 2, 3, 4, 21, 22, 23, 0)))
+        return self.header(1) + bytes(self.byts)
+
    @property
    def flat_book(self):
        '''
@ -117,6 +131,7 @@ class TAGX(object): # {{{
        list(map(self.add_tag, (1, 2, 3, 4, 0)))
        return self.header(1) + bytes(self.byts)

+
 # }}}

 # Index Entries {{{
@ -187,6 +202,9 @@ class IndexEntry(object):
            ans |= TAGX.BITMASKS[tag]
        return ans

+    def attr_for_tag(self, tag):
+        return self.RTAG_MAP[tag]
+
    @property
    def bytestring(self):
        buf = StringIO()
@ -210,7 +228,7 @@ class IndexEntry(object):
            buf.write(bytes(bytearray([flags])))

        for tag in self.tag_nums:
-            attr = self.RTAG_MAP[tag]
+            attr = self.attr_for_tag(tag)
            val = getattr(self, attr)
            if isinstance(val, int):
                val = [val]
@ -226,6 +244,17 @@ class IndexEntry(object):
        ans = buf.getvalue()
        return ans

+class BookIndexEntry(IndexEntry):
+
+    @property
+    def entry_type(self):
+        tagx = TAGX_BOOK()
+        ans = 0
+        for tag in self.tag_nums:
+            ans |= tagx.BITMASKS[tag]
+        return ans
+
+
 class PeriodicalIndexEntry(IndexEntry):

    def __init__(self, offset, label_offset, class_offset, depth):
@ -461,7 +490,6 @@ class Indexer(object): # {{{
                    if not desc: desc = _('No details available')
                    node.author, node.description = aut, desc

-
        self.cncx = CNCX(oeb.toc, self.is_periodical)

        if self.is_periodical:
@ -529,7 +557,9 @@ class Indexer(object): # {{{
            tagx_block = TAGX().secondary
        else:
            tagx_block = (TAGX().periodical if self.is_periodical else
-                            TAGX().flat_book)
+                            (TAGX_BOOK().hierarchical_book if
+                                self.book_has_subchapters else
+                                TAGX_BOOK().flat_book))
        header_length = 192

        # Ident 0 - 4
@ -615,47 +645,98 @@ class Indexer(object): # {{{
    # }}}

    def create_book_index(self): # {{{
+        self.book_has_subchapters = False
        indices = []
-        seen = set()
+        seen, sub_seen = set(), set()
        id_offsets = self.serializer.id_offsets

-        for node in self.oeb.toc.iterdescendants():
+        # Flatten toc to contain only chapters and subchapters
+        # Anything deeper than a subchapter is made into a subchapter
+        chapters = []
+        for node in self.oeb.toc:
            try:
                offset = id_offsets[node.href]
                label = self.cncx[node.title]
            except:
-                self.log.warn('TOC item %s not found in document'%node.href)
+                self.log.warn('TOC item %s [%s] not found in document'%(
+                    node.title, node.href))
                continue
+
            if offset in seen:
                continue
            seen.add(offset)
-            index = IndexEntry(offset, label)
-            indices.append(index)

-        indices.sort(key=lambda x:x.offset)
+            subchapters = []
+            chapters.append((offset, label, subchapters))

-        # Set lengths
-        for i, index in enumerate(indices):
-            try:
-                next_offset = indices[i+1].offset
-            except:
-                next_offset = self.serializer.body_end_offset
-            index.length = next_offset - index.offset
+            for descendant in node.iterdescendants():
+                try:
+                    offset = id_offsets[descendant.href]
+                    label = self.cncx[descendant.title]
+                except:
+                    self.log.warn('TOC item %s [%s] not found in document'%(
+                        descendant.title, descendant.href))
+                    continue

-        # Remove empty nodes
-        indices = [i for i in indices if i.length > 0]
+                if offset in sub_seen:
+                    continue
+                sub_seen.add(offset)
+                subchapters.append((offset, label))

-        # Set index values
-        for i, index in enumerate(indices):
-            index.index = i
+            subchapters.sort(key=lambda x:x[0])

-        # Set lengths again to close up any gaps left by filtering
-        for i, index in enumerate(indices):
-            try:
-                next_offset = indices[i+1].offset
-            except:
-                next_offset = self.serializer.body_end_offset
-            index.length = next_offset - index.offset
+        chapters.sort(key=lambda x:x[0])
+
+        chapters = [(BookIndexEntry(x[0], x[1]), [
+            BookIndexEntry(y[0], y[1]) for y in x[2]]) for x in chapters]
+
+        def set_length(indices):
+            for i, index in enumerate(indices):
+                try:
+                    next_offset = indices[i+1].offset
+                except:
+                    next_offset = self.serializer.body_end_offset
+                index.length = next_offset - index.offset
+
+        # Set chapter and subchapter lengths
+        set_length([x[0] for x in chapters])
+        for x in chapters:
+            set_length(x[1])
+
+        # Remove empty chapters
+        chapters = [x for x in chapters if x[0].length > 0]
+
+        # Drop subchapters that are empty or start at/after chapter.next_offset
+        for i, x in enumerate(list(chapters)):
+            chapter, subchapters = x
+            ok_subchapters = []
+            for sc in subchapters:
+                if sc.offset < chapter.next_offset and sc.length > 0:
+                    ok_subchapters.append(sc)
+            chapters[i] = (chapter, ok_subchapters)
+
+        # Reset chapter and subchapter lengths in case any were removed
+        set_length([x[0] for x in chapters])
+        for x in chapters:
+            set_length(x[1])
+
+        # Set index, depth and parent/child index values
+        indices = []
+        for index, x in enumerate(chapters):
+            x[0].index = index
+            indices.append(x[0])
+
+        for chapter, subchapters in chapters:
+            for sc in subchapters:
+                index += 1
+                sc.index = index
+                sc.parent_index = chapter.index
+                indices.append(sc)
+                sc.depth = 1
+                self.book_has_subchapters = True
+            if subchapters:
+                chapter.first_child_index = subchapters[0].index
+                chapter.last_child_index = subchapters[-1].index

        return indices