From e35ac4441e6c1af785a4a131205b476af2bc2493 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 18 Jun 2009 11:06:42 -0700
Subject: [PATCH] MOBI Indexing: Make error handling more robust

---
 src/calibre/ebooks/mobi/writer.py | 119 +++++++++++++++++++++---------
 1 file changed, 84 insertions(+), 35 deletions(-)

diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py
index cd38ebe5d5..25ecfe7896 100644
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@@ -325,6 +325,8 @@ class MobiWriter(object):
         self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
         self._prefer_author_sort = prefer_author_sort
         self._primary_index_record = None
+        self._hasValideNCXEntries = False
+        self._ctoc = ""
         self._HTMLRecords = []
         self._tbSequence = ""
         self._initialIndexRecordFound = False
@@ -364,7 +366,7 @@ class MobiWriter(object):
         self._map_image_names()
         self._generate_text()
         #if INDEXING and not self.opts.no_mobi_index:
-        if INDEXING :
+        if INDEXING and self._hasValidNCXEntries :
             try:
                 self._generate_index()
             except:
@@ -411,6 +413,8 @@ class MobiWriter(object):
 
     def _build_HTMLRecords_Data_List(self):
         # Assemble a HTMLRecordData instance for each HTML record
+        # Return True if valid, False if invalid
+        self._oeb.logger.info('Indexing navPoints ...')
 
         numberOfHTMLRecords = ( self._content_length // RECORD_SIZE ) + 1
 
@@ -423,17 +427,28 @@ class MobiWriter(object):
         toc = self._oeb.toc
         myIndex = 0
         myEndingRecord = 0
+        previousOffset = 0
+        previousLength = 0
+        offset = 0
+        length = 0
         entries = list(toc.iter())[1:]
-        # borrowed from _generate_indxt
+
+
+        # Get offset, length per entry
         for i, child in enumerate(entries):
+
+            '''
             if not child.title or not child.title.strip():
-                child.title = _('Unnamed')
+                child.title = "(none)"
+            '''
             h = child.href
             if h not in self._id_offsets:
                 self._oeb.log.warning('Could not find TOC entry:', child.title)
                 continue
             offset = self._id_offsets[h]
+
             length = None
+
             for sibling in entries[i+1:]:
                 h2 = sibling.href
                 if h2 in self._id_offsets:
@@ -441,9 +456,25 @@ class MobiWriter(object):
                     if offset2 > offset:
                         length = offset2 - offset
                         break
+
             if length is None:
                 length = self._content_length - offset
 
+            # Look a gap between nodes
+            if (i) :
+                if offset != previousOffset + previousLength :
+                    self._oeb.log.warning("\tnodes %d and %d have a gap:" % (i-1, i))
+                    self._oeb.log.warning("\tnode %d offset: 0x%X \t node %d: offset: 0x%X length: 0x%X" % \
+                        (i, offset, i-1, previousOffset, previousLength) )
+                    self._oeb.log.warning('Failed to generate index')
+                    # Zero out self._HTMLRecords, return False
+                    self._HTMLRecords = []
+                    last_name = None
+                    return False
+
+            previousOffset = offset
+            previousLength = length
+
             # Calculate the HTML record for this entry
             myStartingRecord = offset // RECORD_SIZE
 
@@ -461,19 +492,30 @@ class MobiWriter(object):
             # Calculate the ending HTMLRecord of this entry
             myEndingRecord = (offset + length) // RECORD_SIZE
 
-            # Tell the future HTML records about us
             if myEndingRecord > myStartingRecord :
                 interimSpanRecord = myStartingRecord + 1
                 while interimSpanRecord <= myEndingRecord :
                     self._HTMLRecords[interimSpanRecord].continuingNode = myIndex
                     self._HTMLRecords[interimSpanRecord].currentSectionNodeCount = 1
                     interimSpanRecord += 1
+                if self.opts.verbose > 3 :self._oeb.logger.info("\tnode %03d %-15.15s... spans HTML records %03d - %03d \t offset: 0x%06X length: 0x%06X" % \
+                    (myIndex, child.title if child.title.strip() > "" else "(missing)", myStartingRecord, interimSpanRecord, offset, length) )
+            else :
+                if self.opts.verbose > 3 : self._oeb.logger.info("\tnode %03d %-15.15s... spans HTML records %03d - %03d \t offset: 0x%06X length: 0x%06X" % \
+                    (myIndex, child.title if child.title.strip() > "" else "(missing)", myStartingRecord, myStartingRecord, offset, length) )
 
             ctoc_offset = self._ctoc_map[child]
-            last_name = "%04d" % myIndex
+            last_name = "%04X" % myIndex
             myIndex += 1
 
+        # Successfully parsed the entries
+        return True
+
+
     def _build_TBS_Book(self, nrecords, lastrecord):
+        if self.opts.verbose > 3 and False :
+            self._oeb.logger.info("_build_TBS_Book: HTML record %d of %d" % (nrecords, lastrecord) )
+            self._HTMLRecords[nrecords].dumpData(nrecords,self._oeb)
 
         # Variables for trailing byte sequence
         tbsType = 0x00
@@ -533,7 +575,9 @@ class MobiWriter(object):
                 tbSequence += chr(self._HTMLRecords[nrecords].currentSectionNodeCount)
             tbSequence += decint(len(tbSequence) + 1, DECINT_FORWARD)
 
-        # print "record %d: tbsType %d" % (nrecords, tbsType)
+        if self.opts.verbose > 3  and False:
+            self._oeb.logger.info("record %d: tbsType %d" % (nrecords, tbsType) )
+
         self._tbSequence = tbSequence
 
 
@@ -556,16 +600,16 @@ class MobiWriter(object):
             self._oeb.logger.info('Compressing markup content...')
         data, overlap = self._read_text_record(text)
 
-        # GR borrowed this from generate_index
-        # We seem to need it before calling self._build_HTMLRecords_Data_List()
-        ctoc = self._generate_ctoc()
+        # We need entries[] before calling self._build_HTMLRecords_Data_List()
+        self._ctoc = self._generate_ctoc()
 
         # Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop
         toc = self._oeb.toc
         entries = list(toc.iter())[1:]
-        hasNCXEntries = True if len(entries) else False
-        if hasNCXEntries :
-            self._build_HTMLRecords_Data_List()
+        if len(entries) :
+            self._hasValidNCXEntries = self._build_HTMLRecords_Data_List()
+        else :
+            self._hasValidNCXEntries = False
 
         while len(data) > 0:
             if self._compression == PALMDOC:
@@ -579,7 +623,7 @@ class MobiWriter(object):
             running = offset
 
             # Write Trailing Byte Sequence
-            if INDEXING and hasNCXEntries :
+            if INDEXING and self._hasValidNCXEntries:
                 # Dispatch to different TBS generators based upon publication type
                 booktype = 0x101 if self.opts.mobi_periodical else 0x002
                 if booktype == 0x002 :
@@ -639,6 +683,7 @@ class MobiWriter(object):
         self._text_nrecords = nrecords
 
     def _generate_indxt(self, ctoc):
+
         if self.opts.mobi_periodical:
             raise NotImplementedError('Indexing for periodicals not implemented')
         toc = self._oeb.toc
@@ -655,7 +700,7 @@ class MobiWriter(object):
 
             pos = 0xc0 + indxt.tell()
             indices.write(pack('>H', pos))
-            name = "%04d"%count
+            name = "%04X"%count
             indxt.write(chr(len(name)) + name)
             indxt.write(INDXT['chapter'])
             indxt.write(decint(offset, DECINT_FORWARD))
@@ -666,8 +711,6 @@ class MobiWriter(object):
 
         entries = list(toc.iter())[1:]
         for i, child in enumerate(entries):
-            if not child.title or not child.title.strip():
-                continue
             h = child.href
             if h not in self._id_offsets:
                 self._oeb.log.warning('Could not find TOC entry:', child.title)
@@ -686,7 +729,7 @@ class MobiWriter(object):
 
             add_node(child, offset, length, c)
             ctoc_offset = self._ctoc_map[child]
-            last_name = "%04d"%c
+            last_name = "%04X"%c
             c += 1
 
         return align_block(indxt.getvalue()), c, \
@@ -696,9 +739,9 @@ class MobiWriter(object):
     def _generate_index(self):
         self._oeb.log('Generating primary index...')
         self._primary_index_record = None
-        ctoc = self._generate_ctoc()
+
         indxt, indxt_count, indices, last_name = \
-                self._generate_indxt(ctoc)
+                self._generate_indxt(self._ctoc)
         if last_name is None:
             self._oeb.log.warn('Input document has no TOC. No index generated.')
             return
@@ -801,7 +844,7 @@ class MobiWriter(object):
         indx0 = indx0.getvalue()
 
         self._primary_index_record = len(self._records)
-        self._records.extend([indx0, indx1, ctoc])
+        self._records.extend([indx0, indx1, self._ctoc])
 
         # Turn this off for now
         if False:
@@ -862,30 +905,36 @@ class MobiWriter(object):
     def _generate_ctoc(self):
         if self.opts.mobi_periodical:
             raise NotImplementedError('Indexing for periodicals not implemented')
+        self._oeb.logger.info('Generating CTOC ...')
+
         toc = self._oeb.toc
         self._ctoc_map = {}
         self._ctoc_name_map = {}
         self._last_toc_entry = None
         ctoc = StringIO()
 
-        def add_node(node, cls):
-            t = node.title
-            if not t:
-                t = _('Unnamed')
-            t = t.strip()
-            if not isinstance(t, unicode):
-                t = t.decode('utf-8', 'replace')
-            t = t.encode('utf-8')
-            self._last_toc_entry = t
-            self._ctoc_map[node] = ctoc.tell()
-            self._ctoc_name_map[node] = t
-            ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+        def add_node(node, cls, title=None):
+            t = node.title if title is None else title
+
+            if t and t.strip():
+                t = t.strip()
+                if not isinstance(t, unicode):
+                    t = t.decode('utf-8', 'replace')
+                t = t.encode('utf-8')
+                self._last_toc_entry = t
+                self._ctoc_map[node] = ctoc.tell()
+                self._ctoc_name_map[node] = t
+                ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+            else :
+                t = "(none)".encode('utf-8')
+                self._last_toc_entry = t
+                self._ctoc_map[node] = ctoc.tell()
+                self._ctoc_name_map[node] = t
+                ctoc.write(decint(len(t), DECINT_FORWARD)+t)
 
         first = True
         for child in toc.iter():
-            if not child.title:
-                child.title = _('Unnamed')
-            add_node(child, 'chapter')
+            add_node(child, 'chapter')#, title='Title Page' if first else None)
             first = False
 
         return align_block(ctoc.getvalue())