From bfbd42dd6d0bc5494162cea64c981e46ab8ab8be Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 4 May 2011 10:39:20 -0600
Subject: [PATCH] Fix USA Today

---
 recipes/usatoday.recipe | 397 ++--------------------------------------
 1 file changed, 20 insertions(+), 377 deletions(-)

diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe
index bd47262563..a4899b7187 100644
--- a/recipes/usatoday.recipe
+++ b/recipes/usatoday.recipe
@@ -7,13 +7,11 @@ usatoday.com
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
-import re
 
 class USAToday(BasicNewsRecipe):
 
     title = 'USA Today'
-    __author__ = 'GRiker'
+    __author__ = 'Kovid Goyal'
     oldest_article = 1
     timefmt = ''
     max_articles_per_feed = 20
@@ -31,7 +29,6 @@ class USAToday(BasicNewsRecipe):
                     margin-bottom: 0em; \
                     font-size: smaller;}\n \
         .articleBody {text-align: left;}\n '
-    conversion_options = { 'linearize_tables' : True }
     #simultaneous_downloads = 1
     feeds = [
             ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
@@ -47,63 +44,26 @@ class USAToday(BasicNewsRecipe):
             ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
             ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
             ]
-    keep_only_tags = [dict(attrs={'class':[
-                                'byLine',
-                                'inside-copy',
-                                'inside-head',
-                                'inside-head2',
-                                'item',
-                                'item-block',
-                                'photo-container',
-                                ]}),
-                      dict(id=[
-                                'applyMainStoryPhoto',
-                                'permalink',
-                                ])]
+    keep_only_tags = [dict(attrs={'class':'story'})]
+    remove_tags = [
+        dict(attrs={'class':[
+            'share',
+            'reprints',
+            'inline-h3',
+            'info-extras',
+            'ppy-outer',
+            'ppy-caption',
+            'comments',
+            'jump',
+            'pagetools',
+            'post-attributes',
+            'tags',
+            'bottom-tools',
+            'sponsoredlinks',
+            ]}),
+        dict(id=['pluck']),
+    ]
 
-    remove_tags = [dict(attrs={'class':[
-                                'comments',
-                                'jump',
-                                'pagetools',
-                                'post-attributes',
-                                'tags',
-                                ]}),
-                   dict(id=[])]
-
-    #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]
-
-    def dump_hex(self, src, length=16):
-        ''' Diagnostic '''
-        FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
-        N=0; result=''
-        while src:
-            s,src = src[:length],src[length:]
-            hexa = ' '.join(["%02X"%ord(x) for x in s])
-            s = s.translate(FILTER)
-            result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
-            N+=length
-        print result
-
-    def fixChars(self,string):
-        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","&#8216;",string)
-
-        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","&#8217;",fixed)
-
-        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","&#8220;",fixed)
-
-        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","&#8221;",fixed)
-
-        # Replace ndash (\x96)
-        fixed = re.sub("\x96","&#8211;",fixed)
-
-        # Replace mdash (\x97)
-        fixed = re.sub("\x97","&#8212;",fixed)
-
-        return fixed
 
     def get_masthead_url(self):
         masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
@@ -115,321 +75,4 @@ class USAToday(BasicNewsRecipe):
             masthead = None
         return masthead
 
-    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&amp;'
-            massaged = re.sub("&","&amp;", massaged)
-            return self.fixChars(massaged)
-        else:
-            return description
-
-    def parse_feeds(self, *args, **kwargs):
-        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
-        # Count articles for progress dialog
-        article_count = 0
-        for feed in parsed_feeds:
-            article_count += len(feed)
-        self.log( "Queued %d articles" % article_count)
-        return parsed_feeds
-
-    def preprocess_html(self, soup):
-        soup = self.strip_anchors(soup)
-        return soup
-
-    def postprocess_html(self, soup, first_fetch):
-
-        # Remove navLinks <div style="padding-bottom:3px">
-        navLinks = soup.find(True,{'style':'padding-bottom:3px'})
-        if navLinks:
-            navLinks.extract()
-
-        # Remove <div style="margin-bottom:10px">
-        gibberish = soup.find(True,{'style':'margin-bottom:10px'})
-        if gibberish:
-            gibberish.extract()
-
-        # Change to <h2 class="headline">
-        headline = soup.find(True, {'class':['inside-head','inside-head2']})
-        if not headline:
-            headline = soup.find('h3')
-        if headline:
-            tag = Tag(soup, "h2")
-            tag['class'] = "headline"
-            tag.insert(0, headline.contents[0])
-            headline.replaceWith(tag)
-        else:
-            print "unable to find headline:\n%s\n" % soup
-
-        # Change byLine to byline, change commas to middot
-        # Kindle renders commas in byline as '&amp;'
-        byline = soup.find(True, {'class':'byLine'})
-        if byline:
-            byline['class'] = 'byline'
-            # Replace comma with middot
-            byline.contents[0].replaceWith(re.sub(","," &middot;", byline.renderContents()))
-
-        jumpout_punc_list = [':','?']
-        # Remove the inline jumpouts in <div class="inside-copy">
-        paras = soup.findAll(True, {'class':'inside-copy'})
-        for para in paras:
-            if re.match("<b>[\w\W]+ ",para.renderContents()):
-                p = para.find('b')
-                for punc in jumpout_punc_list:
-                    punc_offset = p.contents[0].find(punc)
-                    if punc_offset == -1:
-                        continue
-                    if punc_offset > 1:
-                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
-                            #print "extracting \n%s\n" % para.prettify()
-                            para.extract()
-
-        # Reset class for remaining
-        paras = soup.findAll(True, {'class':'inside-copy'})
-        for para in paras:
-            para['class'] = 'articleBody'
-
-        # Remove inline jumpouts in <p>
-        paras = soup.findAll(['p'])
-        for p in paras:
-            if hasattr(p,'contents') and len(p.contents):
-                for punc in jumpout_punc_list:
-                    punc_offset = p.contents[0].find(punc)
-                    if punc_offset == -1:
-                        continue
-                    if punc_offset > 2 and hasattr(p,'a') and len(p.contents):
-                        #print "evaluating %s\n" % p.contents[0][:punc_offset+1]
-                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
-                            #print "extracting \n%s\n" % p.prettify()
-                            p.extract()
-
-        # Capture the first img, insert after headline
-        imgs = soup.findAll('img')
-        print "postprocess_html(): %d images" % len(imgs)
-        if imgs:
-            divTag = Tag(soup, 'div')
-            divTag['class'] = 'image'
-            body = soup.find('body')
-            img = imgs[0]
-            #print "img: \n%s\n" % img.prettify()
-
-            # Table for photo and credit
-            tableTag = Tag(soup,'table')
-
-            # Photo
-            trimgTag = Tag(soup, 'tr')
-            tdimgTag = Tag(soup, 'td')
-            tdimgTag.insert(0,img)
-            trimgTag.insert(0,tdimgTag)
-            tableTag.insert(0,trimgTag)
-
-            # Credit
-            trcreditTag = Tag(soup, 'tr')
-
-            tdcreditTag = Tag(soup, 'td')
-            tdcreditTag['class'] = 'credit'
-            credit = soup.find('td',{'class':'photoCredit'})
-            if credit:
-                tdcreditTag.insert(0,NavigableString(credit.renderContents()))
-            else:
-                credit = img['credit']
-                if credit:
-                    tdcreditTag.insert(0,NavigableString(credit))
-                else:
-                    tdcreditTag.insert(0,NavigableString(''))
-
-            trcreditTag.insert(0,tdcreditTag)
-            tableTag.insert(1,trcreditTag)
-            dtc = 0
-            divTag.insert(dtc,tableTag)
-            dtc += 1
-
-            if False:
-                # Add the caption in the table
-                tableCaptionTag = Tag(soup,'caption')
-                tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents())
-                tableTag.insert(1,tableCaptionTag)
-                divTag.insert(dtc,tableTag)
-                dtc += 1
-                body.insert(1,divTag)
-            else:
-                # Add the caption below the table
-                #print "Looking for caption in this soup:\n%s" % img.prettify()
-                captionTag = Tag(soup,'p')
-                captionTag['class'] = 'caption'
-                if hasattr(img,'alt') and img['alt']:
-                    captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['alt']))
-                    divTag.insert(dtc, captionTag)
-                    dtc += 1
-                else:
-                    try:
-                        captionTag.insert(0,NavigableString('<blockquote>%s</blockquote>' % img['cutline']))
-                        divTag.insert(dtc, captionTag)
-                        dtc += 1
-                    except:
-                        pass
-
-                hrTag = Tag(soup, 'hr')
-                divTag.insert(dtc, hrTag)
-                dtc += 1
-
-            # Delete <div> - restructure
-            tag = body.find(True)
-            while True:
-                insertLoc += 1
-                try:
-                    if hasattr(tag,'class') and tag['class'] == 'headline':
-                        headline_found = True
-                        tag.insert(insertLoc,divTag)
-                        break
-                except:
-                    pass
-                tag = tag.next
-                if not tag:
-                    break
-
-            # Yank out headline, img and caption
-            headline = body.find('h2','headline')
-            img = body.find('div','image')
-            caption = body.find('p''class')
-
-            # body(0) is calibre_navbar
-            # body(1) is <h2 class="headline">
-
-            btc = 1
-            headline.extract()
-            body.insert(1, headline)
-            btc += 1
-            if img:
-                img.extract()
-                body.insert(btc, img)
-                btc += 1
-            if caption:
-                caption.extract()
-                body.insert(btc, caption)
-                btc += 1
-
-            if len(imgs) > 1:
-                if True:
-                    [img.extract() for img in imgs[1:]]
-                else:
-                    # Format the remaining images
-                    # This doesn't work yet
-                    for img in imgs[1:]:
-                        print "img:\n%s\n" % img.prettify()
-                        divTag = Tag(soup, 'div')
-                        divTag['class'] = 'image'
-
-                        # Table for photo and credit
-                        tableTag = Tag(soup,'table')
-
-                        # Photo
-                        trimgTag = Tag(soup, 'tr')
-                        tdimgTag = Tag(soup, 'td')
-                        tdimgTag.insert(0,img)
-                        trimgTag.insert(0,tdimgTag)
-                        tableTag.insert(0,trimgTag)
-
-                        # Credit
-                        trcreditTag = Tag(soup, 'tr')
-
-                        tdcreditTag = Tag(soup, 'td')
-                        tdcreditTag['class'] = 'credit'
-                        try:
-                            tdcreditTag.insert(0,NavigableString(img['credit']))
-                        except:
-                            tdcreditTag.insert(0,NavigableString(''))
-                        trcreditTag.insert(0,tdcreditTag)
-                        tableTag.insert(1,trcreditTag)
-                        divTag.insert(0,tableTag)
-                        soup.img.replaceWith(divTag)
-
-        return soup
-
-    def postprocess_book(self, oeb, opts, log) :
-
-        def extract_byline(href) :
-            # '<div class="byline">' :
-            soup = BeautifulSoup(self.index_to_soup(href))
-            p = soup.find(True, attrs={'class':'byline'})
-            if p is not None:
-                return self.massageNCXText(self.tag_to_string(p,use_alt=False))
-            else:
-                print "Didn't find <div class='byline'> in this soup:\n%s" % soup.prettify()
-                return None
-
-        # Method entry point here
-        # Single section toc looks different than multi-section tocs
-        if oeb.toc.depth() == 2 :
-            for article in oeb.toc :
-                if article.author is None :
-                    article.author = extract_byline(article.href)
-                if article.description is None :
-                    article.description = extract_description(article.href)
-        elif oeb.toc.depth() == 3 :
-            for section in oeb.toc :
-                for article in section :
-                    article.author = extract_byline(article.href)
-                    '''
-                    if article.author is None :
-                        article.author = self.massageNCXText(extract_byline(article.href))
-                    else:
-                        article.author = self.massageNCXText(article.author)
-                    '''
-                    if article.description is None :
-                        article.description = extract_description(article.href)
-
-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
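
Note: the fix relies on USA Today's redesigned article pages wrapping the story
content in a single div with class "story", so the declarative
keep_only_tags/remove_tags pair can replace all of the hand-rolled
postprocess_html() surgery deleted above. A minimal standalone sketch for
sanity-checking those selectors against a live page, assuming the 2011-era
Python 2 / BeautifulSoup 3 stack that calibre bundled (the article URL below is
hypothetical):

    import urllib2
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    url = 'http://www.usatoday.com/news/nation/example-story.htm'  # hypothetical
    soup = BeautifulSoup(urllib2.urlopen(url).read())

    # keep_only_tags: the article body should live in one <div class="story">
    story = soup.find('div', attrs={'class': 'story'})
    print 'story div found:', story is not None

    if story is not None:
        # remove_tags: strip the same page chrome the recipe removes
        for cls in ['share', 'reprints', 'inline-h3', 'info-extras',
                    'ppy-outer', 'ppy-caption', 'comments', 'jump',
                    'pagetools', 'post-attributes', 'tags', 'bottom-tools',
                    'sponsoredlinks']:
            for tag in story.findAll(attrs={'class': cls}):
                tag.extract()
        print story.prettify()[:500]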
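The updated recipe can also be exercised end to end with calibre's recipe test
mode, e.g. "ebook-convert usatoday.recipe .epub --test -vv", which fetches only
a couple of articles per feed; exact flags may vary across calibre versions.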