Start work on new MOBI indexing implementation

2025-06-23 15:30:45 -04:00 · 2011-07-22 18:48:48 -06:00 · 2011-07-22 18:48:48 -06:00 · 60f1f24e66
commit 60f1f24e66
parent eab57e4f82
6 changed files with 231 additions and 100 deletions
--- a/src/calibre/ebooks/mobi/output.py
+++ b/src/calibre/ebooks/mobi/output.py
@ -82,26 +82,6 @@ class MOBIOutput(OutputFormatPlugin):
        else:
            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
    def dump_toc(self, toc):
        '''
        Diagnostic helper: write the TOC tree to self.log, one line per
        title/href, walking three levels (periodical -> section -> article).

        :param toc: Object with ``title``, ``href`` and ``nodes`` attributes.
                    Each entry of ``toc.nodes`` and each of its children
                    must be iterable and carry ``title`` and ``href``.
        '''
        emit = self.log
        emit("\n         >>> TOC contents <<<")
        emit("     toc.title: %s" % toc.title)
        emit("      toc.href: %s" % toc.href)
        for top_node in toc.nodes:
            emit("\tperiodical title: %s" % top_node.title)
            emit("\t            href: %s" % top_node.href)
            for sec_node in top_node:
                emit("\t\tsection title: %s" % sec_node.title)
                emit("\t\tfirst article: %s" % sec_node.href)
                for leaf in sec_node:
                    # Article titles are logged via repr() so that odd
                    # characters show up escaped in the log
                    shown_title = repr(leaf.title)
                    emit("\t\t\tarticle title: %s" % shown_title)
                    emit("\t\t\t         href: %s" % leaf.href)
    def dump_manifest(self):
        '''
        Diagnostic helper: write a banner followed by every entry of
        ``self.oeb.manifest.hrefs`` (one tab-indented line each) to self.log.
        '''
        emit = self.log
        emit("\n         >>> Manifest entries <<<")
        for entry in self.oeb.manifest.hrefs:
            emit("\t%s" % entry)
    def periodicalize_toc(self):
        from calibre.ebooks.oeb.base import TOC
        toc = self.oeb.toc
@ -156,12 +136,6 @@ class MOBIOutput(OutputFormatPlugin):
            # Fix up the periodical href to point to first section href
            toc.nodes[0].href = toc.nodes[0].nodes[0].href
            # diagnostics
            if self.opts.verbose > 3:
                self.dump_toc(toc)
                self.dump_manifest()
    def convert(self, oeb, output_path, input_plugin, opts, log):
        self.log, self.opts, self.oeb = log, opts, oeb
        from calibre.ebooks.mobi.mobiml import MobiMLizer
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@ -177,3 +177,23 @@ def get_trailing_data(record, extra_data_flags):
            record = record[:-sz]
    return data, record
 def encode_trailing_data(raw):
    '''
    Wrap the bytestring raw as a trailing data entry of the form

        <data><size>

    where <size> is a backward encoded variable width integer holding the
    length of the whole returned bytestring (data plus the size field
    itself).

    This is the layout of the trailing entries found at the end of text
    records; get_trailing_data() performs the reverse operation.
    '''
    # The size field counts its own bytes, so its width has to be found by
    # trial: grow the assumed width until the encoding is exactly that wide.
    width = 1
    size_field = encint(len(raw) + width, forward=False)
    while len(size_field) != width:
        width += 1
        size_field = encint(len(raw) + width, forward=False)
    return raw + size_field
--- a/src/calibre/ebooks/mobi/writer2/init.py
+++ b/src/calibre/ebooks/mobi/writer2/init.py
@ -12,4 +12,5 @@ UNCOMPRESSED = 1
 PALMDOC = 2
 HUFFDIC = 17480
 PALM_MAX_IMAGE_SIZE = 63 * 1024
 RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@ -0,0 +1,116 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 from struct import pack
 from cStringIO import StringIO
 from collections import OrderedDict
 from calibre.ebooks import normalize
 from calibre.ebooks.mobi.utils import encint
 def utf8_text(text):
    '''
    Return text as a normalized, whitespace-stripped UTF-8 bytestring.

    Falsy input (None, empty) and input that is only whitespace yield the
    translated word 'Unknown' instead, so the result is never empty.
    Bytestring input is decoded as UTF-8 (undecodable bytes are replaced)
    before normalization.
    '''
    stripped = text.strip() if text else text
    if not stripped:
        return _('Unknown').encode('utf-8')
    if not isinstance(stripped, unicode):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')
 def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of ``multiple`` (4 by default). If raw is already aligned it
    is returned unchanged.
    '''
    # Number of pad units still missing to reach the next boundary
    shortfall = -len(raw) % multiple
    if shortfall:
        return raw + pad * shortfall
    return raw
 class CNCX(object): # {{{
    '''
    Create the CNCX records. These are records containing all the strings from
    the NCX. Each record is of the form: <vwi string size><utf-8 encoded
    string>

    After construction:

        * ``self.strings`` maps every collected string to its offset. The
          offset is ``record_number * 0x10000 + position_inside_record``.
        * ``self.records`` is the list of record bytestrings, in order.

    Indexing an instance with a string returns that string's offset.
    '''
    MAX_STRING_LENGTH = 500
    def __init__(self, toc, opts):
        '''
        :param toc: Iterable of TOC items exposing ``title``, ``klass``,
                    ``description`` and ``author`` attributes.
        :param opts: Conversion options; only ``mobi_periodical`` is read.
        '''
        self.strings = OrderedDict()
        for item in toc:
            # The TOC object itself is not an entry. (The toc argument is
            # the only TOC in scope; no self.toc attribute exists.)
            if item is toc: continue
            label = item.title
            klass = item.klass
            if opts.mobi_periodical:
                if item.description:
                    self.strings[item.description] = 0
                if item.author:
                    self.strings[item.author] = 0
            self.strings[label] = self.strings[klass] = 0
        self.records = []
        offset = 0
        buf = StringIO()
        # Iterate over a snapshot of the keys as the values are updated
        # inside the loop
        for key in tuple(self.strings.iterkeys()):
            utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
            raw = encint(len(utf8)) + utf8
            if 0xfbf8 - buf.tell() < 6 + len(raw):
                # Records in PDB files cannot be larger than 0x10000, so we
                # stop well before that.
                pad = 0xfbf8 - buf.tell()
                buf.write(b'\0' * pad)
                self.records.append(buf.getvalue())
                buf.truncate(0)
                # Offsets in the next record start at the next multiple of
                # 0x10000
                offset = len(self.records) * 0x10000
            # Store the entry itself; the offset recorded below points at it
            buf.write(raw)
            self.strings[key] = offset
            offset += len(raw)
        buf.write(b'\0') # CNCX must end with zero byte
        self.records.append(align_block(buf.getvalue()))
    def __getitem__(self, string):
        '''
        Return the offset recorded for string. Raises KeyError if the string
        was not collected from the TOC.
        '''
        return self.strings[string]
 # }}}
 class Indexer(object):
    def __init__(self, serializer, number_of_text_records, opts, oeb):
        '''
        Store the conversion state needed for indexing and build the CNCX
        string records from the book's TOC.

        :param serializer: Stored as ``self.serializer``.
        :param number_of_text_records: Stored as
                                       ``self.number_of_text_records``.
        :param opts: Conversion options, also passed on to CNCX.
        :param oeb: The book; its ``log`` and ``toc`` attributes are read.
        '''
        self.oeb, self.opts = oeb, opts
        self.log = oeb.log
        self.serializer = serializer
        self.number_of_text_records = number_of_text_records
        # Index records are collected here
        self.records = []
        self.cncx = CNCX(oeb.toc, opts)
    def create_header(self):
        buf = StringIO()
        # Ident
        buf.write(b'INDX')
        # Header length
        buf.write(pack(b'>I', 192))
        # Index type: 0 - normal, 2 - inflection
        buf.write(pack(b'>I', 2))
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@ -17,8 +17,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
-from calibre.ebooks.mobi.utils import (rescale_image, encint)
+from calibre.ebooks.mobi.utils import (rescale_image, encint,
        encode_trailing_data)
 EXTH_CODES = {
    'creator': 100,
@ -39,9 +40,6 @@ EXTH_CODES = {
 # Disabled as I don't care about uncrossable breaks
 WRITE_UNCROSSABLE_BREAKS = False
 RECORD_SIZE = 0x1000 # 4096
 MAX_THUMB_SIZE = 16 * 1024
 MAX_THUMB_DIMEN = (180, 240)
@ -53,6 +51,7 @@ class MobiWriter(object):
        self.write_page_breaks_after_item = write_page_breaks_after_item
        self.compression = UNCOMPRESSED if opts.dont_compress else PALMDOC
        self.prefer_author_sort = opts.prefer_author_sort
        self.last_text_record_idx = 1
    def __call__(self, oeb, path_or_stream):
        if hasattr(path_or_stream, 'write'):
@ -79,9 +78,44 @@ class MobiWriter(object):
    def generate_content(self):
        self.map_image_names()
        self.generate_text()
-        # Image records come after text records
+        # Index records come after text records
        self.generate_index()
        self.write_uncrossable_breaks()
        # Image records come after index records
        self.generate_images()
    # Indexing {{{
    def generate_index(self):
        '''
        Placeholder for index generation: no index records are created
        here. It only sets ``primary_index_record_idx`` to None, which
        generate_record0() treats as "no primary index record" (it writes
        0xffffffff and leaves the indexing bit of the extra data flags
        unset).
        '''
        self.primary_index_record_idx = None
    # }}}
    def write_uncrossable_breaks(self): # {{{
        '''
        Append a trailing data entry describing uncrossable breaks (non
        linear items in the spine) to every text record.

        Does nothing unless WRITE_UNCROSSABLE_BREAKS is set. Break offsets
        are consumed (popped) from ``self.serializer.breaks`` as they are
        written, so that list is modified in place.
        '''
        if not WRITE_UNCROSSABLE_BREAKS:
            return
        pending = self.serializer.breaks
        for rec_idx in xrange(1, self.last_text_record_idx+1):
            rec_offset = rec_idx * RECORD_SIZE
            cursor = rec_offset
            payload = StringIO()
            # Take every break that lies less than one record size past
            # rec_offset
            while pending and (pending[0] - rec_offset) < RECORD_SIZE:
                # Distances are stored in units of 8 bytes, relative to the
                # previously written (quantized) position
                delta = (pending.pop(0) - cursor) >> 3
                payload.write(encint(delta))
                cursor += delta << 3
            trailer = encode_trailing_data(payload.getvalue())
            self.records[rec_idx] += trailer
    # }}}
    # Images {{{
    def map_image_names(self):
        '''
        Map image names to record indices, ensuring that the masthead image if
@ -120,23 +154,38 @@ class MobiWriter(object):
            if self.first_image_record is None:
                self.first_image_record = len(self.records) - 1
    def add_thumbnail(self, item):
        '''
        Create a thumbnail from the image in item, register it in the
        manifest as a JPEG and append it to the records.

        :return: The image index assigned to the thumbnail, or None if the
                 image could not be rescaled (IOError), in which case a
                 warning is logged and nothing is added.
        '''
        try:
            thumb = rescale_image(item.data, maxsizeb=MAX_THUMB_SIZE,
                    dimen=MAX_THUMB_DIMEN)
        except IOError:
            self.oeb.logger.warn('Bad image file %r' % item.href)
            return None
        mf = self.oeb.manifest
        thumb_id, thumb_href = mf.generate('thumbnail', 'thumbnail.jpeg')
        mf.add(thumb_id, thumb_href, 'image/jpeg', data=thumb)
        # Image indices are 1-based
        image_index = len(self.images) + 1
        self.images[thumb_href] = image_index
        self.records.append(thumb)
        return image_index
    # }}}
    # Text {{{
    def generate_text(self):
        self.oeb.logger.info('Serializing markup content...')
-        serializer = Serializer(self.oeb, self.images,
+        self.serializer = Serializer(self.oeb, self.images,
                write_page_breaks_after_item=self.write_page_breaks_after_item)
-        text = serializer()
+        text = self.serializer()
        breaks = serializer.breaks
        self.anchor_offset_kindle = serializer.anchor_offset_kindle
        self.id_offsets = serializer.id_offsets
        self.content_length = len(text)
        self.text_length = len(text)
        text = StringIO(text)
        buf = []
        nrecords = 0
        offset = 0
        if self.compression != UNCOMPRESSED:
            self.oeb.logger.info('  Compressing markup content...')
        data, overlap = self.read_text_record(text)
        while len(data) > 0:
@ -146,39 +195,15 @@ class MobiWriter(object):
            record.write(data)
            self.records.append(record.getvalue())
            buf.append(self.records[-1])
            nrecords += 1
            offset += RECORD_SIZE
            data, overlap = self.read_text_record(text)
            # Write information about the multibyte character overlap, if any
            record.write(overlap)
            record.write(pack(b'>B', len(overlap)))
            # Write information about uncrossable breaks (non linear items in
            # the spine)
            if WRITE_UNCROSSABLE_BREAKS:
                nextra = 0
                pbreak = 0
                running = offset
-                # Write information about every uncrossable break that occurs in
+        self.last_text_record_idx = nrecords
                # the next record.
                while breaks and (breaks[0] - offset) < RECORD_SIZE:
                    pbreak = (breaks.pop(0) - running) >> 3
                    encoded = encint(pbreak)
                    record.write(encoded)
                    running += pbreak << 3
                    nextra += len(encoded)
                lsize = 1
                while True:
                    size = encint(nextra + lsize, forward=False)
                    if len(size) == lsize:
                        break
                    lsize += 1
                record.write(size)
        self.text_nrecords = nrecords + 1
    def read_text_record(self, text):
        '''
@ -230,25 +255,31 @@ class MobiWriter(object):
        return data, overlap
-    def generate_end_records(self):
+    # }}}
        self.flis_number = len(self.records)
        self.records.append('\xE9\x8E\x0D\x0A')
-    def generate_record0(self): # {{{
+    def generate_record0(self): #  MOBI header {{{
        metadata = self.oeb.metadata
        exth = self.build_exth()
        last_content_record = len(self.records) - 1
        # EOF record
        self.records.append('\xE9\x8E\x0D\x0A')
        self.generate_end_records()
        record0 = StringIO()
-        # The PalmDOC Header
+        # The MOBI Header
-        record0.write(pack(b'>HHIHHHH', self.compression, 0,
+        record0.write(pack(b'>HHIHHHH',
-            self.text_length,
+            self.compression, # compression type # compression type
-            self.text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
+            0, # Unused
            self.text_length, # Text length
            self.last_text_record_idx, # Number of text records or last tr idx
            RECORD_SIZE, # Text record size
            0, # Unused
            0  # Unused
        )) # 0 - 15 (0x0 - 0xf)
        uid = random.randint(0, 0xffffffff)
        title = normalize(unicode(metadata.title[0])).encode('utf-8')
        # The MOBI Header
        # 0x0 - 0x3
        record0.write(b'MOBI')
@ -270,7 +301,6 @@ class MobiWriter(object):
        # 0x18 - 0x1f : Unknown
        record0.write(b'\xff' * 8)
        # 0x20 - 0x23 : Secondary index record
        record0.write(pack(b'>I', 0xffffffff))
@ -279,7 +309,7 @@ class MobiWriter(object):
        # 0x40 - 0x43 : Offset of first non-text record
        record0.write(pack(b'>I',
-            self.text_nrecords + 1))
+            self.last_text_record_idx + 1))
        # 0x44 - 0x4b : title offset, title length
        record0.write(pack(b'>II',
@ -289,7 +319,7 @@ class MobiWriter(object):
        record0.write(iana2mobi(
            str(metadata.language[0])))
-        # 0x50 - 0x57 : Unknown
+        # 0x50 - 0x57 : Input language and Output language
        record0.write(b'\0' * 8)
        # 0x58 - 0x5b : Format version
@ -348,19 +378,20 @@ class MobiWriter(object):
        # 0xe0 - 0xe3 : Extra record data
        # Extra record data flags:
-        #   - 0x1: <extra multibyte bytes><size> (?)
+        #   - 0b1  : <extra multibyte bytes><size>
-        #   - 0x2: <TBS indexing description of this HTML record><size> GR
+        #   - 0b10 : <TBS indexing description of this HTML record><size>
-        #   - 0x4: <uncrossable breaks><size>
+        #   - 0b100: <uncrossable breaks><size>
        # GR: Use 7 for indexed files, 5 for unindexed
        # Setting bit 2 (0x2) disables <guide><reference type="start"> functionality
        extra_data_flags = 0b1 # Has multibyte overlap bytes
        if self.primary_index_record_idx is not None:
            extra_data_flags |= 0b10
        if WRITE_UNCROSSABLE_BREAKS:
            extra_data_flags |= 0b100
        record0.write(pack(b'>I', extra_data_flags))
        # 0xe4 - 0xe7 : Primary index record
-        record0.write(pack(b'>I', 0xffffffff))
+        record0.write(pack(b'>I', 0xffffffff if self.primary_index_record_idx
            is None else self.primary_index_record_idx))
        record0.write(exth)
        record0.write(title)
@ -371,7 +402,7 @@ class MobiWriter(object):
        self.records[0] = record0
    # }}}
-    def build_exth(self): # {{{
+    def build_exth(self): # EXTH Header {{{
        oeb = self.oeb
        exth = StringIO()
        nrecs = 0
@ -467,22 +498,10 @@ class MobiWriter(object):
        return b''.join(exth)
    # }}}
-    def add_thumbnail(self, item):
+    def write_header(self): # PalmDB header {{{
-        try:
+        '''
-            data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
+        Write the PalmDB header
-                    maxsizeb=MAX_THUMB_SIZE)
+        '''
        except IOError:
            self.oeb.logger.warn('Bad image file %r' % item.href)
            return None
        manifest = self.oeb.manifest
        id, href = manifest.generate('thumbnail', 'thumbnail.jpeg')
        manifest.add(id, href, 'image/jpeg', data=data)
        index = len(self.images) + 1
        self.images[href] = index
        self.records.append(data)
        return index
    def write_header(self):
        title = ascii_filename(unicode(self.oeb.metadata.title[0]))
        title = title + (b'\0' * (32 - len(title)))
        now = int(time.time())
@ -494,6 +513,7 @@ class MobiWriter(object):
            self.write(pack(b'>I', offset), b'\0', pack(b'>I', 2*i)[1:])
            offset += len(record)
        self.write(b'\0\0')
    # }}}
    def write_content(self):
        for record in self.records:
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@ -138,7 +138,7 @@ class Serializer(object):
        buf = self.buf
        self.anchor_offset = buf.tell()
        buf.write(b'<body>')
-        self.anchor_offset_kindle = buf.tell()
+        self.body_start_offset = buf.tell()
        spine = [item for item in self.oeb.spine if item.linear]
        spine.extend([item for item in self.oeb.spine if not item.linear])
        for item in spine: