KF8 Output: Start work on the index layer

2025-07-09 03:04:10 -04:00 · 2012-04-20 22:39:32 +05:30 · 2012-04-20 22:39:32 +05:30 · 081897ae57
commit 081897ae57
parent 6c631e0e64
3 changed files with 132 additions and 41 deletions
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'

 import struct, string, imghdr, zlib, os
 from collections import OrderedDict
+from io import BytesIO

 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks import normalize
@ -549,3 +550,48 @@ def create_text_record(text):

    return data, overlap

+class CNCX(object): # {{{
+
+    '''
+    Create the CNCX records. These are records containing all the strings from
+    an index. Each record is of the form: <vwi string size><utf-8 encoded
+    string>
+
+    After construction ``self.records`` is the list of record payloads and
+    ``self[string]`` returns the offset assigned to ``string``. An offset is
+    ``record_index * 0x10000 + position_inside_that_record``.
+
+    :param strings: Iterable of the strings to store. A string that occurs
+                    more than once is stored only once. Each string is sliced
+                    to at most MAX_STRING_LENGTH before being encoded, but is
+                    still looked up by its full, original value.
+    '''
+
+    MAX_STRING_LENGTH = 500
+
+    def __init__(self, strings=()):
+        # Maps every string to its offset; insertion order fixes the layout
+        self.strings = OrderedDict((s, 0) for s in strings)
+
+        self.records = []
+        offset = 0
+        buf = BytesIO()
+        for key in tuple(self.strings.iterkeys()):
+            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
+            l = len(utf8)
+            sz_bytes = encint(l)
+            raw = sz_bytes + utf8
+            if 0xfbf8 - buf.tell() < 6 + len(raw):
+                # Records in PDB files cannot be larger than 0x10000, so we
+                # stop well before that.
+                pad = 0xfbf8 - buf.tell()
+                buf.write(b'\0' * pad)
+                self.records.append(buf.getvalue())
+                # Unlike cStringIO, io.BytesIO.truncate() does not move the
+                # stream position. Rewind first, otherwise the next write()
+                # lands at the old position and the gap is zero filled, which
+                # corrupts the record and invalidates the stored offsets.
+                buf.seek(0)
+                buf.truncate()
+                offset = len(self.records) * 0x10000
+            buf.write(raw)
+            self.strings[key] = offset
+            offset += len(raw)
+
+        # NOTE(review): a final record is appended even when there are no
+        # strings, so self.records is never empty and __bool__ below always
+        # returns True -- confirm that this is intended.
+        self.records.append(align_block(buf.getvalue()))
+
+    def __getitem__(self, string):
+        ''' Return the offset assigned to ``string``. Raises KeyError if the
+        string was not passed to the constructor. '''
+        return self.strings[string]
+
+    def __bool__(self):
+        return bool(self.records)
+    __nonzero__ = __bool__
+
+# }}}
+
+
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@ -13,54 +13,21 @@ from cStringIO import StringIO
 from collections import OrderedDict, defaultdict

 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
-        encode_tbs, align_block, utf8_text, RECORD_SIZE)
+        encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)

-class CNCX(object): # {{{
-
-    '''
-    Create the CNCX records. These are records containing all the strings from
-    the NCX. Each record is of the form: <vwi string size><utf-8 encoded
-    string>
-    '''
-
-    MAX_STRING_LENGTH = 500
+class CNCX(CNCX_): # {{{
+
+    '''
+    CNCX records for the strings of a table of contents. The title of every
+    descendant of ``toc`` is stored; when ``is_periodical`` is true the
+    ``klass`` of every item and its ``author`` and ``description`` (when not
+    empty) are stored as well. Building the records and looking up offsets
+    is inherited from the base class.
+    '''

    def __init__(self, toc, is_periodical):
-        self.strings = OrderedDict()
-
+        # Only collect the strings here, in breadth first order; the base
+        # class constructor builds the records from them.
+        strings = []
        for item in toc.iterdescendants(breadth_first=True):
-            self.strings[item.title] = 0
+            strings.append(item.title)
            if is_periodical:
-                self.strings[item.klass] = 0
+                strings.append(item.klass)
                if item.author:
-                    self.strings[item.author] = 0
+                    strings.append(item.author)
                if item.description:
-                    self.strings[item.description] = 0
-
-        self.records = []
-        offset = 0
-        buf = StringIO()
-        for key in tuple(self.strings.iterkeys()):
-            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
-            l = len(utf8)
-            sz_bytes = encint(l)
-            raw = sz_bytes + utf8
-            if 0xfbf8 - buf.tell() < 6 + len(raw):
-                # Records in PDB files cannot be larger than 0x10000, so we
-                # stop well before that.
-                pad = 0xfbf8 - buf.tell()
-                buf.write(b'\0' * pad)
-                self.records.append(buf.getvalue())
-                buf.truncate(0)
-                offset = len(self.records) * 0x10000
-            buf.write(raw)
-            self.strings[key] = offset
-            offset += len(raw)
-
-        self.records.append(align_block(buf.getvalue()))
-
-    def __getitem__(self, string):
-        return self.strings[string]
+                    strings.append(item.description)
+        CNCX_.__init__(self, strings)
 # }}}

 class TAGX(object): # {{{
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from future_builtins import map
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from collections import namedtuple
+from struct import pack
+
+from calibre.ebooks.mobi.utils import CNCX
+
+# Description of a single tag in a TAGX table. generate_tagx() writes every
+# field except the name as one byte each.
+TagMeta = namedtuple('TagMeta',
+        'name number values_per_entry bitmask end_flag')
+# The entry with end_flag set; it is the last item of every tag_types tuple
+# defined in this module.
+EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+
+class Index(object):
+
+    '''
+    Base class for the index definitions in this module. Subclasses override
+    ``tag_types`` and may replace ``cncx`` with their own instance.
+    '''
+
+    control_byte_count = 1
+    cncx = CNCX()
+    tag_types = (EndTagTable,)
+
+    @classmethod
+    def generate_tagx(cls):
+        '''
+        Return the TAGX block for this index as a bytestring: the b'TAGX'
+        signature, the total block length and the control byte count (two
+        big-endian unsigned 32 bit integers), followed by the tag table.
+        '''
+        table = bytearray()
+        for meta in cls.tag_types:
+            # Every field except the name is written as one byte
+            table.extend(meta[1:])
+        # table length, control byte count. The 12 accounts for the four
+        # byte signature and the two four byte integers packed here.
+        sizes = pack(b'>II', 12 + len(table), cls.control_byte_count)
+        return b'TAGX' + sizes + bytes(table)
+
+class SkelIndex(Index):
+
+    '''
+    Index built from a skeleton table.
+
+    :param skel_table: Iterable of objects with the attributes ``name``,
+                       ``chunk_count``, ``start_pos`` and ``length``.
+    '''
+
+    # TagMeta._make() builds a TagMeta from a sequence of field values, so it
+    # accepts both the plain tuples and the existing EndTagTable instance.
+    # Calling TagMeta(some_tuple) directly raises TypeError, as the
+    # constructor expects the five fields as separate arguments.
+    tag_types = tuple(map(TagMeta._make, (
+        ('chunk_count', 1, 1, 3, 0),
+        ('geometry',    6, 2, 12, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, skel_table):
+        # One (name, {tag name: values}) pair per skeleton
+        self.entries = [
+                (s.name, {
+                    # Dont ask me why these entries have to be repeated twice
+                    'chunk_count':(s.chunk_count, s.chunk_count),
+                    'geometry':(s.start_pos, s.length, s.start_pos, s.length),
+                    }) for s in skel_table
+        ]
+
+
+class ChunkIndex(Index):
+
+    '''
+    Index built from a chunk table. The selectors of the chunks are stored
+    in ``self.cncx`` and every entry refers to its selector by CNCX offset.
+
+    :param chunk_table: Iterable of objects with the attributes
+                        ``selector``, ``insert_pos``, ``file_number``,
+                        ``sequence_number``, ``start_pos`` and ``length``.
+                        It is iterated twice, so it must not be a one-shot
+                        iterator.
+    '''
+
+    # TagMeta._make() builds a TagMeta from a sequence of field values, so it
+    # accepts both the plain tuples and the existing EndTagTable instance.
+    # Calling TagMeta(some_tuple) directly raises TypeError, as the
+    # constructor expects the five fields as separate arguments.
+    tag_types = tuple(map(TagMeta._make, (
+        ('cncx_offset',     2, 1, 1, 0),
+        ('file_number',     3, 1, 2, 0),
+        ('sequence_number', 4, 1, 4, 0),
+        ('geometry',        6, 2, 8, 0),
+        EndTagTable
+    )))
+
+    def __init__(self, chunk_table):
+        self.cncx = CNCX(c.selector for c in chunk_table)
+
+        # One (zero padded insert position, {tag name: values}) pair per
+        # chunk. The loop variable has to be ``c``, the name used inside the
+        # entry; the ``c`` of the generator expression above is not visible
+        # here.
+        self.entries = [
+                ('%010d'%c.insert_pos, {
+
+                    'cncx_offset':self.cncx[c.selector],
+                    'file_number':c.file_number,
+                    'sequence_number':c.sequence_number,
+                    'geometry':(c.start_pos, c.length),
+                    }) for c in chunk_table
+        ]
+
+
+