...

2025-07-09 03:04:10 -04:00 · 2011-07-17 16:20:56 -06:00 · 2011-07-17 16:20:56 -06:00 · 823cacf811
commit 823cacf811
parent e313a72ec1
2 changed files with 88 additions and 94 deletions
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@ -10,6 +10,7 @@ __docformat__ = 'restructuredtext en'
 import struct, datetime, sys, os
 from calibre.utils.date import utc_tz
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
+from calibre.ebooks.mobi.writer2.utils import decode_hex_number

 # PalmDB {{{
 class PalmDOCAttributes(object):
@ -382,7 +383,7 @@ class TagX(object): # {{{
                self.num_values, self.bitmask, self.bmask, self.eof)
    # }}}

-class PrimaryIndexRecord(object): # {{{
+class IndexHeader(object): # {{{

    def __init__(self, record):
        self.record = record
@ -437,9 +438,8 @@ class PrimaryIndexRecord(object): # {{{
            raise ValueError('TAGX last entry is not EOF')

        idxt0_pos = self.header_length+self.tagx_header_length
-        last_name_len, = struct.unpack(b'>B', raw[idxt0_pos])
-        count_pos = idxt0_pos+1+last_name_len
-        last_num = int(raw[idxt0_pos+1:count_pos], 16)
+        last_num, consumed = decode_hex_number(raw[idxt0_pos:])
+        count_pos = idxt0_pos + consumed
        self.ncx_count, = struct.unpack(b'>H', raw[count_pos:count_pos+2])

        if last_num != self.ncx_count - 1:
@ -457,9 +457,12 @@ class PrimaryIndexRecord(object): # {{{
    def __str__(self):
        ans = ['*'*20 + ' Index Header '+ '*'*20]
        a = ans.append
+        def u(w):
+            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
+                len(w), not bool(w.replace(b'\0', b'')) ))
+
        a('Header length: %d'%self.header_length)
-        a('Unknown1: %r (%d bytes) (All zeros: %r)'%(self.unknown1,
-            len(self.unknown1), not bool(self.unknown1.replace(b'\0', '')) ))
+        u(self.unknown1)
        a('Index Type: %s (%d)'%(self.index_type_desc, self.index_type))
        a('Offset to IDXT start: %d'%self.idxt_start)
        a('Number of index records: %d'%self.index_count)
@ -472,11 +475,9 @@ class PrimaryIndexRecord(object): # {{{
        a('LIGT start: %d'%self.ligt_start)
        a('Number of LIGT entries: %d'%self.num_of_ligt_entries)
        a('Number of CTOC blocks: %d'%self.num_of_ctoc_blocks)
-        a('Unknown2: %r (%d bytes) (All zeros: %r)'%(self.unknown2,
-            len(self.unknown2), not bool(self.unknown2.replace(b'\0', '')) ))
+        u(self.unknown2)
        a('TAGX offset: %d'%self.tagx_offset)
-        a('Unknown3: %r (%d bytes) (All zeros: %r)'%(self.unknown3,
-            len(self.unknown3), not bool(self.unknown3.replace(b'\0', '')) ))
+        u(self.unknown3)
        a('\n\n')
        a('*'*20 + ' TAGX Header (%d bytes)'%self.tagx_header_length+ '*'*20)
        a('Header length: %d'%self.tagx_header_length)
@ -488,6 +489,71 @@ class PrimaryIndexRecord(object): # {{{
        return '\n'.join(ans)
    # }}}

+class IndexEntry(object):
+
+    '''
+    A single entry read from an index record.
+
+    Holds the entry's identifier (exposed as ``id``) and its entry type.
+    The ``raw`` argument is accepted but is not currently stored or parsed.
+    '''
+
+    def __init__(self, ident, entry_type, raw):
+        self.id, self.entry_type = ident, entry_type
+
+class IndexRecord(object): # {{{
+
+    '''
+    Debug view of a single INDX record.
+
+    Parses the fixed 192 byte header, the IDXT table of entry offsets and the
+    identifier and type byte that start each index entry. Fields whose meaning
+    is not known are kept as raw bytes in ``unknown1`` .. ``unknown4`` so that
+    they can be dumped by ``__str__``.
+    '''
+
+    def __init__(self, record):
+        '''
+        :param record: Object whose ``raw`` attribute is the bytestring of
+                       the record to parse.
+
+        Raises ValueError if the record does not start with ``INDX``, if the
+        IDXT offset points inside the fixed header, if the IDXT table is not
+        where the header says it is, or if an entry has no type byte.
+        '''
+        self.record = record
+        raw = self.record.raw
+        if raw[:4] != b'INDX':
+            raise ValueError('Invalid Primary Index Record')
+
+        u = struct.unpack
+
+        self.header_length, = u('>I', raw[4:8])
+        self.unknown1 = raw[8:12]
+        self.header_type, = u('>I', raw[12:16])
+        self.unknown2 = raw[16:20]
+        self.idxt_offset, self.idxt_count = u(b'>II', raw[20:28])
+        # The entry data is assumed to start at byte 192, so the IDXT table
+        # cannot begin before that
+        if self.idxt_offset < 192:
+            raise ValueError('Unknown Index record structure')
+        self.unknown3 = raw[28:36]
+        self.unknown4 = raw[36:192] # Should be 156 bytes
+
+        # The IDXT table is the marker b'IDXT' followed by idxt_count
+        # big-endian 16 bit offsets. The offsets are relative to the start of
+        # the record, they are stored here relative to the start of the entry
+        # data (i.e. with the 192 byte header subtracted).
+        self.index_offsets = []
+        indices = raw[self.idxt_offset:]
+        if indices[:4] != b'IDXT':
+            raise ValueError("Invalid IDXT index table")
+        indices = indices[4:]
+        for i in range(self.idxt_count):
+            off, = u(b'>H', indices[i*2:(i+1)*2])
+            self.index_offsets.append(off-192)
+
+        # Entry data lies between the fixed header and the IDXT table
+        indxt = raw[192:self.idxt_offset]
+        self.indices = []
+        for off in self.index_offsets:
+            index = indxt[off:]
+            # decode_hex_number returns the decoded identifier and the number
+            # of bytes it occupied
+            ident, consumed = decode_hex_number(index)
+            index = index[consumed:]
+            if not index:
+                raise ValueError(
+                    'Index entry at offset %d has no type byte'%off)
+            # Slice rather than index so that struct always receives a
+            # bytestring of length one. The trailing comma unpacks the
+            # 1-tuple returned by struct.unpack, so that entry_type is an
+            # integer like every other field parsed here, not a tuple.
+            entry_type, = u(b'>B', index[:1])
+            self.indices.append(IndexEntry(ident, entry_type, index[1:]))
+
+    def __str__(self):
+        ''' Human readable dump of the parsed header fields and offsets '''
+        ans = ['*'*20 + ' Index Record (%d bytes)'%len(self.record.raw)+ '*'*20]
+        a = ans.append
+        def u(w):
+            # Dump a field of unknown meaning, noting whether it is all zeros
+            a('Unknown: %r (%d bytes) (All zeros: %r)'%(w,
+                len(w), not bool(w.replace(b'\0', b'')) ))
+        a('Header length: %d'%self.header_length)
+        u(self.unknown1)
+        a('Header Type: %d'%self.header_type)
+        u(self.unknown2)
+        a('IDXT Offset: %d'%self.idxt_offset)
+        a('IDXT Count: %d'%self.idxt_count)
+        u(self.unknown3)
+        u(self.unknown4)
+        a('Index offsets: %r'%self.index_offsets)
+
+        return '\n'.join(ans)
+
+# }}}
+
 class MOBIFile(object): # {{{

    def __init__(self, stream):
@ -516,10 +582,11 @@ class MOBIFile(object): # {{{

        self.mobi_header = MOBIHeader(self.records[0])

-        self.primary_index_record = None
+        self.index_header = None
        pir = self.mobi_header.primary_index_record
        if pir != 0xffffffff:
-            self.primary_index_record = PrimaryIndexRecord(self.records[pir])
+            self.index_header = IndexHeader(self.records[pir])
+            self.index_record = IndexRecord(self.records[pir+1])


    def print_header(self, f=sys.stdout):
@ -542,9 +609,12 @@ def inspect_mobi(path_or_stream):
        os.mkdir(ddir)
    with open(os.path.join(ddir, 'header.txt'), 'wb') as out:
        f.print_header(f=out)
-    if f.primary_index_record is not None:
-        with open(os.path.join(ddir, 'primary_index_record.txt'), 'wb') as out:
-            print(str(f.primary_index_record), file=out)
+    if f.index_header is not None:
+        with open(os.path.join(ddir, 'index.txt'), 'wb') as out:
+            print(str(f.index_header), file=out)
+            print('\n\n', file=out)
+            print(str(f.index_record), file=out)
+
    print ('Debug data saved to:', ddir)

 def main():
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@ -15,10 +15,11 @@ from calibre.ebooks import normalize
 from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
 from calibre.ebooks.mobi.writer2.serializer import Serializer
 from calibre.ebooks.compression.palmdoc import compress_doc
-from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
 from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED
+from calibre.ebooks.mobi.writer2.utils import (rescale_image, decint,
+        DECINT_FORWARD, DECINT_BACKWARD)

 EXTH_CODES = {
    'creator': 100,
@ -41,87 +42,10 @@ WRITE_UNCROSSABLE_BREAKS = False

 RECORD_SIZE = 0x1000 # 4096

-IMAGE_MAX_SIZE = 10 * 1024 * 1024
+
 MAX_THUMB_SIZE = 16 * 1024
 MAX_THUMB_DIMEN = (180, 240)

-# Almost like the one for MS LIT, but not quite.
-DECINT_FORWARD = 0
-DECINT_BACKWARD = 1
-
-def decint(value, direction):
-    '''
-    Some parts of the Mobipocket format encode data as variable-width integers.
-    These integers are represented big-endian with 7 bits per byte in bits 1-7.
-    They may be either forward-encoded, in which case only the LSB has bit 8 set,
-    or backward-encoded, in which case only the MSB has bit 8 set.
-    For example, the number 0x11111 would be represented forward-encoded as:
-
-        0x04 0x22 0x91
-
-    And backward-encoded as:
-
-        0x84 0x22 0x11
-
-    This function encodes the integer ``value`` as a variable width integer and
-    returns the bytestring corresponding to it.
-    '''
-    # Encode vwi
-    byts = bytearray()
-    while True:
-        b = value & 0x7f
-        value >>= 7
-        byts.append(b)
-        if value == 0:
-            break
-    if direction == DECINT_FORWARD:
-        byts[0] |= 0x80
-    elif direction == DECINT_BACKWARD:
-        byts[-1] |= 0x80
-    return bytes(byts)
-
-def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
-    '''
-    Convert image setting all transparent pixels to white and changing format
-    to JPEG. Ensure the resultant image has a byte size less than
-    maxsizeb.
-
-    If dimen is not None, generate a thumbnail of width=dimen, height=dimen
-
-    Returns the image as a bytestring
-    '''
-    if dimen is not None:
-        data = thumbnail(data, width=dimen, height=dimen,
-                compression_quality=90)[-1]
-    else:
-        # Replace transparent pixels with white pixels and convert to JPEG
-        data = save_cover_data_to(data, 'img.jpg', return_data=True)
-    if len(data) <= maxsizeb:
-        return data
-    orig_data = data
-    img = Image()
-    quality = 95
-
-    img.load(data)
-    while len(data) >= maxsizeb and quality >= 10:
-        quality -= 5
-        img.set_compression_quality(quality)
-        data = img.export('jpg')
-    if len(data) <= maxsizeb:
-        return data
-    orig_data = data
-
-    scale = 0.9
-    while len(data) >= maxsizeb and scale >= 0.05:
-        img = Image()
-        img.load(orig_data)
-        w, h = img.size
-        img.size = (int(scale*w), int(scale*h))
-        img.set_compression_quality(quality)
-        data = img.export('jpg')
-        scale -= 0.05
-    return data
-
 class MobiWriter(object):
    COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')