navLinks = soup.find(True,{'style':'padding-bottom:3px'}) if navLinks: navLinks.extract() # Remove

gibberish = soup.find(True,{'style':'margin-bottom:10px'}) if gibberish: gibberish.extract() # Change to

headline = soup.find(True, {'class':['inside-head','inside-head2']}) if not headline: headline = soup.find('h3') if headline: tag = Tag(soup, "h2") tag['class'] = "headline" tag.insert(0, headline.contents[0]) headline.replaceWith(tag) else: print "unable to find headline:\n%s\n" % soup # Change byLine to byline, change commas to middot # Kindle renders commas in byline as '&' byline = soup.find(True, {'class':'byLine'}) if byline: byline['class'] = 'byline' # Replace comma with middot byline.contents[0].replaceWith(re.sub(","," ·", byline.renderContents())) jumpout_punc_list = [':','?'] # Remove the inline jumpouts in
paras = soup.findAll(True, {'class':'inside-copy'}) for para in paras: if re.match("[\w\W]+ ",para.renderContents()): p = para.find('b') for punc in jumpout_punc_list: punc_offset = p.contents[0].find(punc) if punc_offset == -1: continue if punc_offset > 1: if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper(): #print "extracting \n%s\n" % para.prettify() para.extract() # Reset class for remaining paras = soup.findAll(True, {'class':'inside-copy'}) for para in paras: para['class'] = 'articleBody' # Remove inline jumpouts in
paras = soup.findAll(['p']) for p in paras: if hasattr(p,'contents') and len(p.contents): for punc in jumpout_punc_list: punc_offset = p.contents[0].find(punc) if punc_offset == -1: continue if punc_offset > 2 and hasattr(p,'a') and len(p.contents): #print "evaluating %s\n" % p.contents[0][:punc_offset+1] if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper(): #print "extracting \n%s\n" % p.prettify() p.extract() # Capture the first img, insert after headline imgs = soup.findAll('img') print "postprocess_html(): %d images" % len(imgs) if imgs: divTag = Tag(soup, 'div') divTag['class'] = 'image' body = soup.find('body') img = imgs[0] #print "img: \n%s\n" % img.prettify() # Table for photo and credit tableTag = Tag(soup,'table') # Photo trimgTag = Tag(soup, 'tr') tdimgTag = Tag(soup, 'td') tdimgTag.insert(0,img) trimgTag.insert(0,tdimgTag) tableTag.insert(0,trimgTag) # Credit trcreditTag = Tag(soup, 'tr') tdcreditTag = Tag(soup, 'td') tdcreditTag['class'] = 'credit' credit = soup.find('td',{'class':'photoCredit'}) if credit: tdcreditTag.insert(0,NavigableString(credit.renderContents())) else: credit = img['credit'] if credit: tdcreditTag.insert(0,NavigableString(credit)) else: tdcreditTag.insert(0,NavigableString('')) trcreditTag.insert(0,tdcreditTag) tableTag.insert(1,trcreditTag) dtc = 0 divTag.insert(dtc,tableTag) dtc += 1 if False: # Add the caption in the table tableCaptionTag = Tag(soup,'caption') tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents()) tableTag.insert(1,tableCaptionTag) divTag.insert(dtc,tableTag) dtc += 1 body.insert(1,divTag) else: # Add the caption below the table #print "Looking for caption in this soup:\n%s" % img.prettify() captionTag = Tag(soup,'p') captionTag['class'] = 'caption' if hasattr(img,'alt') and img['alt']: captionTag.insert(0,NavigableString('
%s
' % img['alt'])) divTag.insert(dtc, captionTag) dtc += 1 else: try: captionTag.insert(0,NavigableString('
%s
' % img['cutline'])) divTag.insert(dtc, captionTag) dtc += 1 except: pass hrTag = Tag(soup, 'hr') divTag.insert(dtc, hrTag) dtc += 1 # Delete
- restructure tag = body.find(True) while True: insertLoc += 1 try: if hasattr(tag,'class') and tag['class'] == 'headline': headline_found = True tag.insert(insertLoc,divTag) break except: pass tag = tag.next if not tag: break # Yank out headline, img and caption headline = body.find('h2','headline') img = body.find('div','image') caption = body.find('p''class') # body(0) is calibre_navbar # body(1) is
btc = 1 headline.extract() body.insert(1, headline) btc += 1 if img: img.extract() body.insert(btc, img) btc += 1 if caption: caption.extract() body.insert(btc, caption) btc += 1 if len(imgs) > 1: if True: [img.extract() for img in imgs[1:]] else: # Format the remaining images # This doesn't work yet for img in imgs[1:]: print "img:\n%s\n" % img.prettify() divTag = Tag(soup, 'div') divTag['class'] = 'image' # Table for photo and credit tableTag = Tag(soup,'table') # Photo trimgTag = Tag(soup, 'tr') tdimgTag = Tag(soup, 'td') tdimgTag.insert(0,img) trimgTag.insert(0,tdimgTag) tableTag.insert(0,trimgTag) # Credit trcreditTag = Tag(soup, 'tr') tdcreditTag = Tag(soup, 'td') tdcreditTag['class'] = 'credit' try: tdcreditTag.insert(0,NavigableString(img['credit'])) except: tdcreditTag.insert(0,NavigableString('')) trcreditTag.insert(0,tdcreditTag) tableTag.insert(1,trcreditTag) divTag.insert(0,tableTag) soup.img.replaceWith(divTag) return soup def postprocess_book(self, oeb, opts, log) : def extract_byline(href) : # '' : return self.massageNCXText(self.tag_to_string(p,use_alt=False)) else: print "Didn't find
in this soup:\n%s" % soup.prettify() return None # Method entry point here # Single section toc looks different than multi-section tocs if oeb.toc.depth() == 2 : for article in oeb.toc : if article.author is None : article.author = extract_byline(article.href) if article.description is None : article.description = extract_description(article.href) elif oeb.toc.depth() == 3 : for section in oeb.toc : for article in section : article.author = extract_byline(article.href) ''' if article.author is None : article.author = self.massageNCXText(extract_byline(article.href)) else: article.author = self.massageNCXText(article.author) ''' if article.description is None : article.description = extract_description(article.href) def strip_anchors(self,soup): paras = soup.findAll(True) for para in paras: aTags = para.findAll('a') for a in aTags: if a.img is None: a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup