From ae6f049792bc62eb43688d0b266a3dbbff450750 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 26 Jul 2011 20:05:32 -0600
Subject: [PATCH] ...

---
 src/calibre/ebooks/mobi/debug.py           | 13 ++--
 src/calibre/ebooks/mobi/writer2/indexer.py | 76 +++++++++++++---------
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py
index 6c9a2136b7..4bf8d356cd 100644
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@@ -424,12 +424,7 @@ class IndexHeader(object): # {{{
         if self.index_encoding == 'unknown':
             raise ValueError(
                 'Unknown index encoding: %d'%self.index_encoding_num)
-        self.locale_raw, = struct.unpack(b'>I', raw[32:36])
-        langcode = self.locale_raw
-        langid    = langcode & 0xFF
-        sublangid = (langcode >> 10) & 0xFF
-        self.language = main_language.get(langid, 'ENGLISH')
-        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
+        self.possibly_language = raw[32:36]
         self.num_index_entries, = struct.unpack('>I', raw[36:40])
         self.ordt_start, = struct.unpack('>I', raw[40:44])
         self.ligt_start, = struct.unpack('>I', raw[44:48])
@@ -489,8 +484,7 @@ class IndexHeader(object): # {{{
         a('Number of index records: %d'%self.index_count)
         a('Index encoding: %s (%d)'%(self.index_encoding,
                 self.index_encoding_num))
-        a('Index language: %s - %s (%s)'%(self.language, self.sublanguage,
-            hex(self.locale_raw)))
+        a('Unknown (possibly language?): %r'%(self.possibly_language))
         a('Number of index entries: %d'% self.num_index_entries)
         a('ORDT start: %d'%self.ordt_start)
         a('LIGT start: %d'%self.ligt_start)
@@ -1038,6 +1032,7 @@ class TBSIndexing(object): # {{{
         # }}}
 
         def read_starting_section(byts): # {{{
+            orig = byts
             si, extra, consumed = decode_tbs(byts)
             byts = byts[consumed:]
             if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
@@ -1054,7 +1049,7 @@ class TBSIndexing(object): # {{{
                 eof = extra[0b0001]
                 if eof != 0:
                     raise ValueError('Unknown eof value %s when reading'
-                            ' starting section'%eof)
+                            ' starting section. All bytes: %r'%(eof, orig))
                 ans.append('This record is spanned by an article from'
                         ' the section: %d'%si.index)
             return si, byts
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index 4c428dd38d..14c5328622 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -15,7 +15,6 @@ from collections import OrderedDict, defaultdict
 from calibre.ebooks.mobi.writer2 import RECORD_SIZE
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
         encode_trailing_data, encode_tbs, align_block, utf8_text)
-from calibre.ebooks.mobi.langcodes import iana2mobi
 
 
 class CNCX(object): # {{{
@@ -173,28 +172,34 @@ class TBS(object): # {{{
     trailing byte sequence for the record.
     '''
 
-    def __init__(self, data, is_periodical, first=False, all_sections=[]):
-        if not data:
-            self.bytestring = encode_trailing_data(b'')
-        else:
-            self.section_map = OrderedDict((i.index, i) for i in
-                    sorted(all_sections, key=lambda x:x.offset))
+    def __init__(self, data, is_periodical, first=False, all_sections=[],
+            after_first=False):
+        self.section_map = OrderedDict((i.index, i) for i in
+                sorted(all_sections, key=lambda x:x.offset))
 
-            if is_periodical:
-                # The starting bytes.
-                # The value is zero which I think indicates the periodical
-                # index entry. The values for the various flags seem to be
-                # unused. If the 0b100 is present, it means that the record
-                # deals with section 1 (or is the final record with section
-                # transitions).
-                self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
-                self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
-                        flag_size=3)
-                self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
-                        flag_size=3)
-                self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
-                    0}, flag_size=3)
+        if is_periodical:
+            # The starting bytes.
+            # The value is zero which I think indicates the periodical
+            # index entry. The values for the various flags seem to be
+            # unused. If the 0b100 is present, it means that the record
+            # deals with section 1 (or is the final record with section
+            # transitions).
+            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
+            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
+                    flag_size=3)
+            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
+                    flag_size=3)
+            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
+                0}, flag_size=3)
 
+            if not data:
+                byts = b''
+                if after_first:
+                    # This can happen if a record contains only text between
+                    # the periodical start and the first section
+                    byts = self.type_011
+                self.bytestring = encode_trailing_data(byts)
+            else:
                 depth_map = defaultdict(list)
                 for x in ('starts', 'ends', 'completes'):
                     for idx in data[x]:
@@ -202,6 +207,9 @@ class TBS(object): # {{{
                 for l in depth_map.itervalues():
                     l.sort(key=lambda x:x.offset)
                 self.periodical_tbs(data, first, depth_map)
+        else:
+            if not data:
+                self.bytestring = encode_trailing_data(b'')
             else:
                 self.book_tbs(data, first)
 
@@ -240,15 +248,13 @@ class TBS(object): # {{{
                 # has section transitions
                 if depth_map[2]:
                     parent_section_index = depth_map[2][0].parent_index
-                    typ = self.type_011
                 else:
                     parent_section_index = depth_map[1][0].index
-                    typ = (self.type_110 if parent_section_index == 1 else
-                            self.type_011)
+                typ = self.type_011
 
         buf.write(typ)
 
-        if parent_section_index > 1:
+        if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
             # Write starting section information
             if spanner is None:
                 num_articles = len(depth_map[1])
@@ -429,9 +435,8 @@ class Indexer(object): # {{{
         # Index Encoding 28-32
         buf.write(pack(b'>I', 65001)) # utf-8
 
-        # Index language 32-36
-        buf.write(iana2mobi(
-            str(self.oeb.metadata.language[0])))
+        # Unknown 32-36
+        buf.write(b'\xff'*4)
 
         # Number of index entries 36-40
         buf.write(pack(b'>I', len(self.indices)))
@@ -680,15 +685,20 @@ class Indexer(object): # {{{
         found_node = False
         sections = [i for i in self.indices if i.depth == 1]
         deepest = max(i.depth for i in self.indices)
+
         for i in xrange(self.number_of_text_records):
             offset = i * RECORD_SIZE
             next_offset = offset + RECORD_SIZE
-            data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]),
-                ('spans', None), ('offset', offset)])
+            data = {'ends':[], 'completes':[], 'starts':[],
+                    'spans':None, 'offset':offset, 'record_number':i+1}
+
             for index in self.indices:
                 if index.offset >= next_offset:
                     # Node starts after current record
-                    break
+                    if index.depth == deepest:
+                        break
+                    else:
+                        continue
                 if index.next_offset <= offset:
                     # Node ends before current record
                     continue
@@ -706,13 +716,15 @@ class Indexer(object): # {{{
                         data['ends'].append(index)
                     elif index.depth == deepest:
                         data['spans'] = index
+
             if (data['ends'] or data['completes'] or data['starts'] or
                     data['spans'] is not None):
                 self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
                         found_node, all_sections=sections)
                 found_node = True
             else:
-                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False)
+                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
+                        after_first=found_node)
 
     def get_trailing_byte_sequence(self, num):
         return self.tbs_map[num].bytestring