CNCX diagnostics

This commit is contained in:
GRiker 2009-10-02 08:42:03 -06:00
commit 179e8769da
2 changed files with 123 additions and 46 deletions

View File

@ -340,6 +340,9 @@ class MobiWriter(object):
self._conforming_periodical_toc = False self._conforming_periodical_toc = False
self._indexable = False self._indexable = False
self._ctoc = "" self._ctoc = ""
self._ctoc_records = []
self._ctoc_offset = 0
self._ctoc_largest = 0
self._HTMLRecords = [] self._HTMLRecords = []
self._tbSequence = "" self._tbSequence = ""
self._MobiDoc = None self._MobiDoc = None
@ -887,7 +890,6 @@ class MobiWriter(object):
tbsType = 0x00 tbsType = 0x00
tbSequence = "" tbSequence = ""
# Generate TBS for type 0x101/0x103 - structured periodical # Generate TBS for type 0x101/0x103 - structured periodical
if self._initialIndexRecordFound == False : if self._initialIndexRecordFound == False :
# Is there any indexed content yet? # Is there any indexed content yet?
@ -1001,7 +1003,7 @@ class MobiWriter(object):
# Assemble arg3: Upper nybble: ending section index # Assemble arg3: Upper nybble: ending section index
# Lower nybble = flags for next section - 0 or 1 # Lower nybble = flags for next section - 0 or 1
arg3 = (self._HTMLRecords[nrecords].continuingNodeParent + 1) << 4 arg3 = (self._HTMLRecords[nrecords].continuingNodeParent + 1) << 4
arg3Flags = 0 # 0: has nodes? arg3Flags = 0 # 0: has nodes?
arg3 |= arg3Flags arg3 |= arg3Flags
tbSequence += decint(arg3, DECINT_FORWARD) tbSequence += decint(arg3, DECINT_FORWARD)
@ -1011,7 +1013,7 @@ class MobiWriter(object):
# flag: 4 = starting nodes from previous section # flag: 4 = starting nodes from previous section
sectionBase = self._HTMLRecords[nrecords].continuingNodeParent sectionBase = self._HTMLRecords[nrecords].continuingNodeParent
sectionDelta = self._sectionCount - sectionBase - 1 sectionDelta = self._sectionCount - sectionBase - 1
articleOffset = self._HTMLRecords[nrecords].continuingNode + 1 articleOffset = self._HTMLRecords[nrecords].continuingNode + 1
arg4 = (sectionDelta + articleOffset) << 4 arg4 = (sectionDelta + articleOffset) << 4
@ -1041,7 +1043,7 @@ class MobiWriter(object):
# Write first article of new section # Write first article of new section
#arg6 = self._sectionCount - 1 # We're now into the following section #arg6 = self._sectionCount - 1 # We're now into the following section
#arg6 = self._HTMLRecords[nrecords].nextSectionNumber #arg6 = self._HTMLRecords[nrecords].nextSectionNumber
arg6 = sectionDelta + self._HTMLRecords[nrecords].nextSectionOpeningNode arg6 = sectionDelta + self._HTMLRecords[nrecords].nextSectionOpeningNode
arg6 <<= 4 arg6 <<= 4
if self._HTMLRecords[nrecords].nextSectionNodeCount > 1 : if self._HTMLRecords[nrecords].nextSectionNodeCount > 1 :
arg6Flags = 4 arg6Flags = 4
@ -1238,7 +1240,10 @@ class MobiWriter(object):
self._conforming_periodical_toc = self._evaluate_periodical_toc() self._conforming_periodical_toc = self._evaluate_periodical_toc()
# This routine decides whether to build flat or structured based on self._conforming_periodical_toc # This routine decides whether to build flat or structured based on self._conforming_periodical_toc
self._ctoc = self._generate_ctoc() # self._ctoc = self._generate_ctoc()
# There may be multiple CNCX records built below, but the last record is returned and should be stored
self._ctoc_records.append(self._generate_ctoc())
# Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop # Build the HTMLRecords list so we can assemble the trailing bytes sequences in the following while loop
toc = self._oeb.toc toc = self._oeb.toc
@ -1395,8 +1400,10 @@ class MobiWriter(object):
if btype < 0x100 : if btype < 0x100 :
record0.write(pack('>I', 0xffffffff)) record0.write(pack('>I', 0xffffffff))
elif btype > 0x100 and self._indexable : elif btype > 0x100 and self._indexable :
record0.write(pack('>I', 0xffffffff if self._primary_index_record is if self._primary_index_record is None:
None else self._primary_index_record+3)) record0.write(pack('>I', 0xffffffff))
else:
record0.write(pack('>I', self._primary_index_record + 2 + len(self._ctoc_records)))
else : else :
record0.write(pack('>I', 0xffffffff)) record0.write(pack('>I', 0xffffffff))
@ -1613,8 +1620,7 @@ class MobiWriter(object):
self._primary_index_record = None self._primary_index_record = None
# Build the NCXEntries and INDX # Build the NCXEntries and INDX
indxt, indxt_count, indices, last_name = \ indxt, indxt_count, indices, last_name = self._generate_indxt()
self._generate_indxt(self._ctoc)
if last_name is None: if last_name is None:
self._oeb.log.warn('Input document has no TOC. No index generated.') self._oeb.log.warn('Input document has no TOC. No index generated.')
@ -1723,7 +1729,13 @@ class MobiWriter(object):
indx0 = indx0.getvalue() indx0 = indx0.getvalue()
self._primary_index_record = len(self._records) self._primary_index_record = len(self._records)
self._records.extend([indx0, indx1, self._ctoc])
# GR: handle multiple ctoc records
# self._records.extend([indx0, indx1, self._ctoc])
self._records.extend([indx0, indx1 ])
for (i,ctoc_record) in enumerate(self._ctoc_records):
self._records.append(ctoc_record)
# print "adding %d of %d ctoc records" % (i+1, len(self._ctoc_records))
# Indexing for author/description fields in summary section # Indexing for author/description fields in summary section
# Test for indexed periodical - only one that needs secondary index # Test for indexed periodical - only one that needs secondary index
@ -1786,7 +1798,29 @@ class MobiWriter(object):
else : else :
text = "(none)".encode('utf-8') text = "(none)".encode('utf-8')
return text return text
def _add_to_ctoc(self, ctoc_str, record_offset):
# Write vwilen + string to ctoc
# Return offset
# Is there enough room for this string in the current ctoc record?
if 0xfbf8 - self._ctoc.tell() < 2 + len(ctoc_str):
# flush this ctoc, start a new one
print "closing ctoc_record at 0x%X" % self._ctoc.tell()
print "starting new ctoc with '%-50.50s ...'" % ctoc_str
# pad with 00
pad = 0xfbf8 - self._ctoc.tell()
print "padding %d bytes of 00" % pad
self._ctoc.write('\0' * (pad))
self._ctoc_records.append(self._ctoc.getvalue())
self._ctoc.truncate(0)
self._ctoc_offset += 0x10000
record_offset = self._ctoc_offset
offset = self._ctoc.tell() + record_offset
self._ctoc.write(decint(len(ctoc_str), DECINT_FORWARD) + ctoc_str)
return offset
def _add_flat_ctoc_node(self, node, ctoc, title=None): def _add_flat_ctoc_node(self, node, ctoc, title=None):
# Process 'chapter' or 'article' nodes only, force either to 'chapter' # Process 'chapter' or 'article' nodes only, force either to 'chapter'
t = node.title if title is None else title t = node.title if title is None else title
@ -1803,8 +1837,9 @@ class MobiWriter(object):
ctoc_name_map['klass'] = node.klass ctoc_name_map['klass'] = node.klass
# Add title offset to name map # Add title offset to name map
ctoc_name_map['titleOffset'] = ctoc.tell() # ctoc_name_map['titleOffset'] = ctoc.tell()
ctoc.write(decint(len(t), DECINT_FORWARD)+t) # ctoc.write(decint(len(t), DECINT_FORWARD)+t)
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1 self._chapterCount += 1
# append this node's name_map to map # append this node's name_map to map
@ -1815,6 +1850,8 @@ class MobiWriter(object):
def _add_structured_ctoc_node(self, node, ctoc, title=None): def _add_structured_ctoc_node(self, node, ctoc, title=None):
# Process 'periodical', 'section' and 'article' # Process 'periodical', 'section' and 'article'
# Fetch the offset referencing the current ctoc_record
if node.klass is None : if node.klass is None :
return return
t = node.title if title is None else title t = node.title if title is None else title
@ -1829,14 +1866,16 @@ class MobiWriter(object):
if node.klass == 'chapter': if node.klass == 'chapter':
# Add title offset to name map # Add title offset to name map
ctoc_name_map['titleOffset'] = ctoc.tell() # ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
ctoc.write(decint(len(t), DECINT_FORWARD)+t) # ctoc.write(decint(len(t), DECINT_FORWARD)+t)
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
self._chapterCount += 1 self._chapterCount += 1
elif node.klass == 'periodical' : elif node.klass == 'periodical' :
# Add title offset # Add title offset
ctoc_name_map['titleOffset'] = ctoc.tell() # ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
ctoc.write(decint(len(t), DECINT_FORWARD)+t) # ctoc.write( decint(len(t), DECINT_FORWARD) + t )
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'periodical' in _ctoc_map # Look for existing class entry 'periodical' in _ctoc_map
for entry in self._ctoc_map: for entry in self._ctoc_map:
@ -1847,15 +1886,18 @@ class MobiWriter(object):
else : else :
continue continue
else: else:
ctoc_name_map['classOffset'] = ctoc.tell() # class names should always be in CNCX 0 - no offset
ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass) # ctoc_name_map['classOffset'] = ctoc.tell()
# ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._periodicalCount += 1 self._periodicalCount += 1
elif node.klass == 'section' : elif node.klass == 'section' :
# Add title offset # Add title offset
ctoc_name_map['titleOffset'] = ctoc.tell() # ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
ctoc.write(decint(len(t), DECINT_FORWARD)+t) # ctoc.write(decint(len(t), DECINT_FORWARD)+t)
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'section' in _ctoc_map # Look for existing class entry 'section' in _ctoc_map
for entry in self._ctoc_map: for entry in self._ctoc_map:
@ -1866,15 +1908,18 @@ class MobiWriter(object):
else : else :
continue continue
else: else:
ctoc_name_map['classOffset'] = ctoc.tell() # class names should always be in CNCX 0 - no offset
ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass) # ctoc_name_map['classOffset'] = ctoc.tell()
# ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
self._sectionCount += 1 self._sectionCount += 1
elif node.klass == 'article' : elif node.klass == 'article' :
# Add title offset/title # Add title offset/title
ctoc_name_map['titleOffset'] = ctoc.tell() # ctoc_name_map['titleOffset'] = ctoc.tell() + ctoc_offset
ctoc.write(decint(len(t), DECINT_FORWARD)+t) # ctoc.write(decint(len(t), DECINT_FORWARD)+t)
ctoc_name_map['titleOffset'] = self._add_to_ctoc(t, self._ctoc_offset)
# Look for existing class entry 'article' in _ctoc_map # Look for existing class entry 'article' in _ctoc_map
for entry in self._ctoc_map: for entry in self._ctoc_map:
@ -1884,22 +1929,26 @@ class MobiWriter(object):
else : else :
continue continue
else: else:
ctoc_name_map['classOffset'] = ctoc.tell() # class names should always be in CNCX 0 - no offset
ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass) # ctoc_name_map['classOffset'] = ctoc.tell()
# ctoc.write(decint(len(node.klass), DECINT_FORWARD)+node.klass)
ctoc_name_map['classOffset'] = self._add_to_ctoc(node.klass, 0)
# Add description offset/description # Add description offset/description
if node.description : if node.description :
d = self._clean_text_value(node.description) d = self._clean_text_value(node.description)
ctoc_name_map['descriptionOffset'] = ctoc.tell() # ctoc_name_map['descriptionOffset'] = ctoc.tell() + ctoc_offset
ctoc.write(decint(len(d), DECINT_FORWARD)+d) # ctoc.write(decint(len(d), DECINT_FORWARD)+d)
ctoc_name_map['descriptionOffset'] = self._add_to_ctoc(d, self._ctoc_offset)
else : else :
ctoc_name_map['descriptionOffset'] = None ctoc_name_map['descriptionOffset'] = None
# Add author offset/description # Add author offset/attribution
if node.author : if node.author :
a = self._clean_text_value(node.author) a = self._clean_text_value(node.author)
ctoc_name_map['authorOffset'] = ctoc.tell() # ctoc_name_map['authorOffset'] = ctoc.tell() + ctoc_offset
ctoc.write(decint(len(a), DECINT_FORWARD)+a) # ctoc.write(decint(len(a), DECINT_FORWARD)+a)
ctoc_name_map['authorOffset'] = self._add_to_ctoc(a, self._ctoc_offset)
else : else :
ctoc_name_map['authorOffset'] = None ctoc_name_map['authorOffset'] = None
@ -1909,10 +1958,11 @@ class MobiWriter(object):
raise NotImplementedError( \ raise NotImplementedError( \
'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \ 'writer._generate_ctoc.add_node: title: %s has unrecognized klass: %s, playOrder: %d' % \
(node.title, node.klass, node.play_order)) (node.title, node.klass, node.play_order))
# append this node's name_map to map # append this node's name_map to map
self._ctoc_map.append(ctoc_name_map) self._ctoc_map.append(ctoc_name_map)
def _generate_ctoc(self): def _generate_ctoc(self):
# Generate the compiled TOC strings # Generate the compiled TOC strings
# Each node has 1-4 CTOC entries: # Each node has 1-4 CTOC entries:
@ -1931,7 +1981,8 @@ class MobiWriter(object):
reduced_toc = [] reduced_toc = []
self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets self._ctoc_map = [] # per node dictionary of {class/title/desc/author} offsets
self._last_toc_entry = None self._last_toc_entry = None
ctoc = StringIO() #ctoc = StringIO()
self._ctoc = StringIO()
# Track the individual node types # Track the individual node types
self._periodicalCount = 0 self._periodicalCount = 0
@ -1946,8 +1997,9 @@ class MobiWriter(object):
for (child) in toc.iter(): for (child) in toc.iter():
if self.opts.verbose > 2 : if self.opts.verbose > 2 :
self._oeb.logger.info(" %s" % child) self._oeb.logger.info(" %s" % child)
self._add_structured_ctoc_node(child, ctoc) self._add_structured_ctoc_node(child, self._ctoc)
first = False first = False
else : else :
self._oeb.logger.info('Generating flat CTOC ...') self._oeb.logger.info('Generating flat CTOC ...')
previousOffset = -1 previousOffset = -1
@ -1979,7 +2031,7 @@ class MobiWriter(object):
# print "_generate_ctoc: child offset: 0x%X" % currentOffset # print "_generate_ctoc: child offset: 0x%X" % currentOffset
if currentOffset != previousOffset : if currentOffset != previousOffset :
self._add_flat_ctoc_node(child, ctoc) self._add_flat_ctoc_node(child, self._ctoc)
reduced_toc.append(child) reduced_toc.append(child)
previousOffset = currentOffset previousOffset = currentOffset
else : else :
@ -2021,8 +2073,13 @@ class MobiWriter(object):
(self._periodicalCount, self._sectionCount, self._articleCount) ) (self._periodicalCount, self._sectionCount, self._articleCount) )
else : else :
self._oeb.logger.info("chapterCount: %d" % self._chapterCount) self._oeb.logger.info("chapterCount: %d" % self._chapterCount)
if True:
rec_count = len(self._ctoc_records)
self._oeb.logger.info(" CNCX utilization: %d %s %.0f%% full" % \
(rec_count + 1, 'records, last record' if rec_count else 'record,', len(self._ctoc.getvalue())/655) )
return align_block(ctoc.getvalue()) return align_block(self._ctoc.getvalue())
def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) : def _write_periodical_node(self, indxt, indices, index, offset, length, count, firstSection, lastSection) :
pos = 0xc0 + indxt.tell() pos = 0xc0 + indxt.tell()
@ -2341,7 +2398,7 @@ class MobiWriter(object):
return last_name, c return last_name, c
def _generate_indxt(self, ctoc): def _generate_indxt(self):
# Assumption: child.depth() represents nestedness of the TOC. # Assumption: child.depth() represents nestedness of the TOC.
# A flat document (book) has a depth of 2: # A flat document (book) has a depth of 2:
# <navMap> child.depth() = 2 # <navMap> child.depth() = 2

View File

@ -5,7 +5,6 @@ __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
latimes.com latimes.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LATimes(BasicNewsRecipe): class LATimes(BasicNewsRecipe):
@ -14,8 +13,7 @@ class LATimes(BasicNewsRecipe):
description = u'News from Los Angeles' description = u'News from Los Angeles'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
@ -41,19 +39,24 @@ class LATimes(BasicNewsRecipe):
.subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;} .subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
''' '''
# recursions = 1
# match_regexps = [r'http://www.latimes.com/.*page=[2-9]']
keep_only_tags = [dict(name='div', attrs={'class':["story" ,"entry"] })] keep_only_tags = [dict(name='div', attrs={'class':["story" ,"entry"] })]
remove_tags = [ dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
remove_tags = [ dict(name='div', attrs={'class':['articlerail',"sphereTools","tools","toppaginate","entry-footer-left","entry-footer-right"]}),
dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}), dict(name='div', attrs={'id':["moduleArticleToolsContainer",]}),
dict(name='ul', attrs={'class':["article-nav clearfix",]}),
dict(name='p', attrs={'class':["entry-footer",]}), dict(name='p', attrs={'class':["entry-footer",]}),
dict(name='ul', attrs={'class':"article-nav clearfix"}),
dict(name=['iframe']) dict(name=['iframe'])
] ]
feeds = [(u'News', u'http://feeds.latimes.com/latimes/news') feeds = [(u'News', u'http://feeds.latimes.com/latimes/news')
,(u'Local','http://feeds.latimes.com/latimes/news/local') ,(u'Local','http://feeds.latimes.com/latimes/news/local')
,(u'Most Emailed','http://feeds.latimes.com/MostEmailed') ,(u'MostEmailed','http://feeds.latimes.com/MostEmailed')
,(u'California Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/') ,(u'Politics','http://feeds.latimes.com/latimes/news/local/politics/cal/')
,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/') ,('OrangeCounty','http://feeds.latimes.com/latimes/news/local/orange/')
,('National','http://feeds.latimes.com/latimes/news/nationworld/nation') ,('National','http://feeds.latimes.com/latimes/news/nationworld/nation')
,('Politics','http://feeds.latimes.com/latimes/news/politics/') ,('Politics','http://feeds.latimes.com/latimes/news/politics/')
@ -62,5 +65,22 @@ class LATimes(BasicNewsRecipe):
,('Entertainment','http://feeds.latimes.com/latimes/entertainment/') ,('Entertainment','http://feeds.latimes.com/latimes/entertainment/')
] ]
def get_article_url(self, article): def get_article_url(self, article):
return article.get('feedburner_origlink') ans = article.get('feedburner_origlink').rpartition('?')[0]
try:
self.log('Looking for full story link in', ans)
soup = self.index_to_soup(ans)
x = soup.find(text="single page")
if x is not None:
a = x.parent
if a and a.has_key('href'):
ans = 'http://www.latimes.com'+a['href']
self.log('Found full story link', ans)
except:
pass
return ans