CNCX diagnostics

2025-08-30 23:00:21 -04:00 · 2009-10-02 08:42:03 -06:00 · 2009-10-02 08:42:03 -06:00 · 179e8769da
commit 179e8769da
parent afd8cf61af 407a64c7e9
2 changed files with 123 additions and 46 deletions
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@ -340,6 +340,9 @@ class MobiWriter(object):
        self._conforming_periodical_toc = False
        self._indexable = False
        self._ctoc = ""
        self._ctoc_records = []
        self._ctoc_offset = 0
        self._ctoc_largest = 0
        self._HTMLRecords = []
        self._tbSequence = ""
        self._MobiDoc = None
@ -887,7 +890,6 @@ class MobiWriter(object):
        tbsType = 0x00
        tbSequence = ""
        # Generate TBS for type 0x101/0x103 - structured periodical
        if self._initialIndexRecordFound == False :
            # Is there any indexed content yet?
@ -1001,7 +1003,7 @@ class MobiWriter(object):
                    # Assemble arg3: Upper nybble: ending section index
                    #                Lower nybble = flags for next section - 0 or 1
-                    arg3 = (self._HTMLRecords[nrecords].continuingNodeParent + 1) << 4
+                    arg3 = (self._HTMLRecords[nrecords].continuingNodeParent + 1) << 4                    
                    arg3Flags = 0               # 0: has nodes?
                    arg3 |= arg3Flags
                    tbSequence += decint(arg3, DECINT_FORWARD)
@ -1011,7 +1013,7 @@ class MobiWriter(object):
                    #                              flag: 4 = starting nodes from previous section
                    sectionBase = self._HTMLRecords[nrecords].continuingNodeParent
-                    sectionDelta = self._sectionCount - sectionBase - 1
+                    sectionDelta = self._sectionCount - sectionBase - 1                       
                    articleOffset = self._HTMLRecords[nrecords].continuingNode + 1
                    arg4 = (sectionDelta + articleOffset) << 4
@ -1041,7 +1043,7 @@ class MobiWriter(object):
                    # Write first article of new section
                    #arg6 = self._sectionCount - 1                   # We're now into the following section
                    #arg6 = self._HTMLRecords[nrecords].nextSectionNumber
-                    arg6 = sectionDelta + self._HTMLRecords[nrecords].nextSectionOpeningNode
+                    arg6 = sectionDelta + self._HTMLRecords[nrecords].nextSectionOpeningNode              
                    arg6 <<= 4
                    if self._HTMLRecords[nrecords].nextSectionNodeCount > 1 :
                        arg6Flags = 4
@ -1238,7 +1240,10 @@ class MobiWriter(object):
            self._conforming_periodical_toc = self._evaluate_periodical_toc()
        # This routine decides whether to build flat or structured based on self._conforming_periodical_toc
-        self._ctoc = self._generate_ctoc()
+        # self._ctoc = self._generate_ctoc()
        # There may be multiple CNCX records built below, but the last record is returned and should be stored
        self._ctoc_records.append(self._generate_ctoc())
        # Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop
        toc = self._oeb.toc
@ -1395,8 +1400,10 @@ class MobiWriter(object):
        if btype < 0x100 :
            record0.write(pack('>I', 0xffffffff))
        elif btype > 0x100 and self._indexable :
-            record0.write(pack('>I', 0xffffffff if self._primary_index_record is
+            if self._primary_index_record is None: 
-                None else self._primary_index_record+3))
+                record0.write(pack('>I', 0xffffffff))
            else:
                record0.write(pack('>I', self._primary_index_record + 2 + len(self._ctoc_records)))
        else :
            record0.write(pack('>I', 0xffffffff))
@ -1613,8 +1620,7 @@ class MobiWriter(object):
        self._primary_index_record = None
 		# Build the NCXEntries and INDX
-        indxt, indxt_count, indices, last_name = \
+        indxt, indxt_count, indices, last_name = self._generate_indxt()
                self._generate_indxt(self._ctoc)
        if last_name is None:
            self._oeb.log.warn('Input document has no TOC. No index generated.')
@ -1723,7 +1729,13 @@ class MobiWriter(object):
        indx0 = indx0.getvalue()
        self._primary_index_record = len(self._records)
-        self._records.extend([indx0, indx1, self._ctoc])
+        
        # GR: handle multiple ctoc records
        # self._records.extend([indx0, indx1, self._ctoc])
        self._records.extend([indx0, indx1 ])
        for (i,ctoc_record) in enumerate(self._ctoc_records):
            self._records.append(ctoc_record)
            # print "adding %d of %d ctoc records" % (i+1, len(self._ctoc_records))
        # Indexing for author/description fields in summary section
        # Test for indexed periodical - only one that needs secondary index
@ -1786,7 +1798,29 @@ class MobiWriter(object):
        else :
            text = "(none)".encode('utf-8')
        return text
-
+        
    def _add_to_ctoc(self, ctoc_str, record_offset):
        # Write vwilen + string to ctoc
        # Return offset
        # Is there enough room for this string in the current ctoc record?
        if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str):
            # flush this ctoc, start a new one
            print "closing ctoc_record at 0x%X" % self._ctoc.tell()
            print "starting new ctoc with '%-50.50s ...'" % ctoc_str
            # pad with 00 
            pad = 0xfbf8 - self._ctoc.tell() 
            print "padding %d bytes of 00" % pad
            self._ctoc.write('\0' * (pad))
            self._ctoc_records.append(self._ctoc.getvalue())
            self._ctoc.truncate(0)
            self._ctoc_offset += 0x10000
            record_offset = self._ctoc_offset
        offset = self._ctoc.tell() + record_offset
        self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str)
        return offset
    def _add_flat_ctoc_node(self, node, ctoc, title=None):
        # Process 'chapter' or 'article' nodes only, force either to 'chapter'
        t = node.title if title is None else title
@ -1803,8 +1837,9 @@ class MobiWriter(object):
            ctoc_name_map['klass'] = node.klass
        # Add title offset to name map
-        ctoc_name_map['titleOffset'] = ctoc.tell()
+#         ctoc_name_map['titleOffset'] = ctoc.tell()
-        ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+#         ctoc.write(decint(len(t), DECINT_FORWARD)+t)
        ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
        self._chapterCount += 1
        # append this node's name_map to map
@ -1815,6 +1850,8 @@ class MobiWriter(object):
    def _add_structured_ctoc_node(self, node, ctoc, title=None):
        # Process 'periodical', 'section' and 'article'
        # Fetch the offset referencing the current ctoc_record
        if node.klass is None :
            return
        t = node.title if title is None else title
@ -1829,14 +1866,16 @@ class MobiWriter(object):
        if node.klass == 'chapter':
            # Add title offset to name map
-            ctoc_name_map['titleOffset'] = ctoc.tell()
+#             ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
-            ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+#             ctoc.write(decint(len(t), DECINT_FORWARD)+t)
            ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
            self._chapterCount += 1
        elif node.klass == 'periodical' :
            # Add title offset
-            ctoc_name_map['titleOffset'] = ctoc.tell()
+#             ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
-            ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+#             ctoc.write( decint(len(t), DECINT_FORWARD) + t )
            ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
            # Look for existing class entry 'periodical' in _ctoc_map
            for entry in self._ctoc_map:
@ -1847,15 +1886,18 @@ class MobiWriter(object):
                else :
                    continue
            else:
-                ctoc_name_map['classOffset'] = ctoc.tell()
+                # class names should always be in CNCX 0 - no offset
-                ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
+#                 ctoc_name_map['classOffset'] = ctoc.tell()
 #                 ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
                ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
            self._periodicalCount += 1
        elif node.klass == 'section' :
            # Add title offset
-            ctoc_name_map['titleOffset'] = ctoc.tell()
+#             ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
-            ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+#             ctoc.write(decint(len(t), DECINT_FORWARD)+t)
            ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
            # Look for existing class entry 'section' in _ctoc_map
            for entry in self._ctoc_map:
@ -1866,15 +1908,18 @@ class MobiWriter(object):
                else :
                    continue
            else:
-                ctoc_name_map['classOffset'] = ctoc.tell()
+                # class names should always be in CNCX 0 - no offset
-                ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
+#                 ctoc_name_map['classOffset'] = ctoc.tell()
 #                 ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
                ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
            self._sectionCount += 1
        elif node.klass == 'article' :
            # Add title offset/title
-            ctoc_name_map['titleOffset'] = ctoc.tell()
+#             ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
-            ctoc.write(decint(len(t), DECINT_FORWARD)+t)
+#             ctoc.write(decint(len(t), DECINT_FORWARD)+t)
            ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
            # Look for existing class entry 'article' in _ctoc_map
            for entry in self._ctoc_map:
@ -1884,22 +1929,26 @@ class MobiWriter(object):
                else :
                    continue
            else:
-                ctoc_name_map['classOffset'] = ctoc.tell()
+                # class names should always be in CNCX 0 - no offset
-                ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
+#                 ctoc_name_map['classOffset'] = ctoc.tell()
 #                 ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
                ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
            # Add description offset/description
            if node.description :
                d = self._clean_text_value(node.description)
-                ctoc_name_map['descriptionOffset'] = ctoc.tell()
+#                 ctoc_name_map['descriptionOffset'] = ctoc.tell() + ctoc_offset
-                ctoc.write(decint(len(d), DECINT_FORWARD)+d)
+#                 ctoc.write(decint(len(d), DECINT_FORWARD)+d)
                ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset)
            else :
                ctoc_name_map['descriptionOffset'] = None
-            # Add author offset/description
+            # Add author offset/attribution
            if node.author :
                a = self._clean_text_value(node.author)
-                ctoc_name_map['authorOffset'] = ctoc.tell()
+#                 ctoc_name_map['authorOffset'] = ctoc.tell() + ctoc_offset
-                ctoc.write(decint(len(a), DECINT_FORWARD)+a)
+#                 ctoc.write(decint(len(a), DECINT_FORWARD)+a)
                ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset)
            else :
                ctoc_name_map['authorOffset'] = None
@ -1909,10 +1958,11 @@ class MobiWriter(object):
            raise NotImplementedError( \
            'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \
            (node.title, node.klass, node.play_order))
-
+            
        # append this node's name_map to map
        self._ctoc_map.append(ctoc_name_map)
    def _generate_ctoc(self):
        # Generate the compiled TOC strings
        # Each node has 1-4 CTOC entries:
@ -1931,7 +1981,8 @@ class MobiWriter(object):
        reduced_toc = []
        self._ctoc_map = []				# per node dictionary of {class/title/desc/author} offsets
        self._last_toc_entry = None
-        ctoc = StringIO()
+        #ctoc = StringIO()
        self._ctoc = StringIO()
        # Track the individual node types
        self._periodicalCount = 0
@ -1946,8 +1997,9 @@ class MobiWriter(object):
            for (child) in toc.iter():
                if self.opts.verbose > 2 :
                    self._oeb.logger.info("  %s" % child)
-                self._add_structured_ctoc_node(child, ctoc)
+                self._add_structured_ctoc_node(child, self._ctoc)
                first = False
        else :
            self._oeb.logger.info('Generating flat CTOC ...')
            previousOffset = -1
@ -1979,7 +2031,7 @@ class MobiWriter(object):
                    # print "_generate_ctoc: child offset: 0x%X" % currentOffset
                    if currentOffset != previousOffset :
-                        self._add_flat_ctoc_node(child, ctoc)
+                        self._add_flat_ctoc_node(child, self._ctoc)
                        reduced_toc.append(child)
                        previousOffset = currentOffset
                    else :
@ -2021,8 +2073,13 @@ class MobiWriter(object):
                                    (self._periodicalCount, self._sectionCount, self._articleCount) )
            else :
                self._oeb.logger.info("chapterCount: %d" % self._chapterCount)
        if True:
            rec_count = len(self._ctoc_records)
            self._oeb.logger.info("  CNCX utilization: %d %s %.0f%% full" % \
                (rec_count + 1, 'records, last record' if rec_count else 'record,', len(self._ctoc.getvalue())/655) )
-        return align_block(ctoc.getvalue())
+        return align_block(self._ctoc.getvalue())
    def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) :
        pos = 0xc0 + indxt.tell()
@ -2341,7 +2398,7 @@ class MobiWriter(object):
        return last_name, c
-    def _generate_indxt(self, ctoc):
+    def _generate_indxt(self):
        # Assumption: child.depth() represents nestedness of the TOC.
        # A flat document (book) has a depth of 2:
        # <navMap>					child.depth() = 2
--- a/src/calibre/web/feeds/recipes/recipe_latimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_latimes.py
@ -5,7 +5,6 @@ __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
 latimes.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class LATimes(BasicNewsRecipe):
@ -14,8 +13,7 @@ class LATimes(BasicNewsRecipe):
    description           = u'News from Los Angeles'
    oldest_article        = 7
    max_articles_per_feed = 100
-    language = 'en'
+    language              = 'en'
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
@ -41,19 +39,24 @@ class LATimes(BasicNewsRecipe):
                .subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
                '''
   # recursions = 1
   # match_regexps = [r'http://www.latimes.com/.*page=[2-9]']
    keep_only_tags    = [dict(name='div', attrs={'class':["story"  ,"entry"] })]
-    remove_tags       = [   dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
+
    remove_tags      = [   dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
                            dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}),
                            dict(name='ul', attrs={'class':["article-nav clearfix",]}),
                            dict(name='p', attrs={'class':["entry-footer",]}),
                           dict(name='ul', attrs={'class':"article-nav clearfix"}),
                            dict(name=['iframe'])
                        ]
    feeds          = [(u'News', u'http://feeds.latimes.com/latimes/news')
                      ,(u'Local','http://feeds.latimes.com/latimes/news/local')
-                      ,(u'Most Emailed','http://feeds.latimes.com/MostEmailed')
+                      ,(u'MostEmailed','http://feeds.latimes.com/MostEmailed')
-                      ,(u'California Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/')
+                      ,(u'Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/')
                      ,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/')
                      ,('National','http://feeds.latimes.com/latimes/news/nationworld/nation')
                      ,('Politics','http://feeds.latimes.com/latimes/news/politics/')
@ -62,5 +65,22 @@ class LATimes(BasicNewsRecipe):
                      ,('Entertainment','http://feeds.latimes.com/latimes/entertainment/')
                      ]
    def get_article_url(self, article):
-        return article.get('feedburner_origlink')
+        ans = article.get('feedburner_origlink').rpartition('?')[0]
        try:
            self.log('Looking for full story link in', ans)
            soup = self.index_to_soup(ans)
            x = soup.find(text="single page")
            if x is not None:
                a = x.parent
                if a and a.has_key('href'):
                    ans = 'http://www.latimes.com'+a['href']
                    self.log('Found full story link', ans)
        except:
            pass
        return ans