diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index bd47262563..a4899b7187 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -7,13 +7,11 @@ usatoday.com ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag -import re class USAToday(BasicNewsRecipe): title = 'USA Today' - __author__ = 'GRiker' + __author__ = 'Kovid Goyal' oldest_article = 1 timefmt = '' max_articles_per_feed = 20 @@ -31,7 +29,6 @@ class USAToday(BasicNewsRecipe): margin-bottom: 0em; \ font-size: smaller;}\n \ .articleBody {text-align: left;}\n ' - conversion_options = { 'linearize_tables' : True } #simultaneous_downloads = 1 feeds = [ ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), @@ -47,63 +44,26 @@ class USAToday(BasicNewsRecipe): ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'), ] - keep_only_tags = [dict(attrs={'class':[ - 'byLine', - 'inside-copy', - 'inside-head', - 'inside-head2', - 'item', - 'item-block', - 'photo-container', - ]}), - dict(id=[ - 'applyMainStoryPhoto', - 'permalink', - ])] + keep_only_tags = [dict(attrs={'class':'story'})] + remove_tags = [ + dict(attrs={'class':[ + 'share', + 'reprints', + 'inline-h3', + 'info-extras', + 'ppy-outer', + 'ppy-caption', + 'comments', + 'jump', + 'pagetools', + 'post-attributes', + 'tags', + 'bottom-tools', + 'sponsoredlinks', + ]}), + dict(id=['pluck']), + ] - remove_tags = [dict(attrs={'class':[ - 'comments', - 'jump', - 'pagetools', - 'post-attributes', - 'tags', - ]}), - dict(id=[])] - - #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')] - - def dump_hex(self, src, length=16): - ''' Diagnostic ''' - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) - N=0; result='' - while src: - s,src = src[:length],src[length:] - hexa = ' '.join(["%02X"%ord(x) for x in s]) - s = s.translate(FILTER) - result += "%04X %-*s %s\n" % (N, length*3, hexa, s) - N+=length - print result - - def fixChars(self,string): - # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) - - # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) - - # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) - - # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) - - # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) - - # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) - - return fixed def get_masthead_url(self): masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif' @@ -115,321 +75,4 @@ class USAToday(BasicNewsRecipe): masthead = None return masthead - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&","&", massaged) - return self.fixChars(massaged) - else: - return description - def parse_feeds(self, *args, **kwargs): - parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs) - # Count articles for progress dialog - article_count = 0 - for feed in parsed_feeds: - article_count += len(feed) - self.log( "Queued %d articles" % article_count) - return parsed_feeds - - def preprocess_html(self, soup): - soup = self.strip_anchors(soup) - return soup - - def postprocess_html(self, soup, first_fetch): - - # Remove navLinks
- paras = soup.findAll(['p']) - for p in paras: - if hasattr(p,'contents') and len(p.contents): - for punc in jumpout_punc_list: - punc_offset = p.contents[0].find(punc) - if punc_offset == -1: - continue - if punc_offset > 2 and hasattr(p,'a') and len(p.contents): - #print "evaluating %s\n" % p.contents[0][:punc_offset+1] - if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper(): - #print "extracting \n%s\n" % p.prettify() - p.extract() - - # Capture the first img, insert after headline - imgs = soup.findAll('img') - print "postprocess_html(): %d images" % len(imgs) - if imgs: - divTag = Tag(soup, 'div') - divTag['class'] = 'image' - body = soup.find('body') - img = imgs[0] - #print "img: \n%s\n" % img.prettify() - - # Table for photo and credit - tableTag = Tag(soup,'table') - - # Photo - trimgTag = Tag(soup, 'tr') - tdimgTag = Tag(soup, 'td') - tdimgTag.insert(0,img) - trimgTag.insert(0,tdimgTag) - tableTag.insert(0,trimgTag) - - # Credit - trcreditTag = Tag(soup, 'tr') - - tdcreditTag = Tag(soup, 'td') - tdcreditTag['class'] = 'credit' - credit = soup.find('td',{'class':'photoCredit'}) - if credit: - tdcreditTag.insert(0,NavigableString(credit.renderContents())) - else: - credit = img['credit'] - if credit: - tdcreditTag.insert(0,NavigableString(credit)) - else: - tdcreditTag.insert(0,NavigableString('')) - - trcreditTag.insert(0,tdcreditTag) - tableTag.insert(1,trcreditTag) - dtc = 0 - divTag.insert(dtc,tableTag) - dtc += 1 - - if False: - # Add the caption in the table - tableCaptionTag = Tag(soup,'caption') - tableCaptionTag.insert(0,soup.find('td',{'class':'photoCredit'}).renderContents()) - tableTag.insert(1,tableCaptionTag) - divTag.insert(dtc,tableTag) - dtc += 1 - body.insert(1,divTag) - else: - # Add the caption below the table - #print "Looking for caption in this soup:\n%s" % img.prettify() - captionTag = Tag(soup,'p') - captionTag['class'] = 'caption' - if hasattr(img,'alt') and img['alt']: - captionTag.insert(0,NavigableString('
%s' % img['alt'])) - divTag.insert(dtc, captionTag) - dtc += 1 - else: - try: - captionTag.insert(0,NavigableString('
%s' % img['cutline'])) - divTag.insert(dtc, captionTag) - dtc += 1 - except: - pass - - hrTag = Tag(soup, 'hr') - divTag.insert(dtc, hrTag) - dtc += 1 - - # Delete