diff --git a/resources/recipes/usatoday.recipe b/resources/recipes/usatoday.recipe
index 1a314f652e..368437a709 100644
--- a/resources/recipes/usatoday.recipe
+++ b/resources/recipes/usatoday.recipe
@@ -7,62 +7,430 @@ usatoday.com
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag
 import re
 
 class USAToday(BasicNewsRecipe):
 
     title = 'USA Today'
-    timefmt = ' [%d %b %Y]'
-    __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'GRiker'
+    oldest_article = 1
+    timefmt = ''
     max_articles_per_feed = 20
     language = 'en'
-
-    no_stylesheets = True
-    extra_css = '''
-            .inside-head{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold }
-            .inside-head2{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold }
-            .inside-head3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold }
-            h3{font-family:Arial,Helvetica,sans-serif; font-size:large; font-weight:bold; }
-            h4{font-family:Arial,Helvetica,sans-serif; font-size:x-small; font-weight:bold; }
-            .side-by-side{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
-            #byLineTag{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
-            .inside-copy{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left;}
-            .caption{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
-            li{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;}
-            .vatext{font-family:Arial,Helvetica,sans-serif; font-size:x-small;text-align:left ;}
-            .vaTextBold{font-family:Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold; color:#666666;}
-            '''
-    remove_tags = [
-        {'class':['tagListLabel','piped-taglist-string','socialcontainer','social-wrapper',]},
-        {'id':['topSocialButtons']},
-        ]
-
+    extra_css = '.headline {text-align: left;}\n \
+                 .byline {font-family: monospace; \
+                          text-align: left; \
+                          margin-bottom: 1em;}\n \
+                 .image {text-align: center;}\n \
+                 .caption {text-align: center; \
+                           font-size: smaller; \
+                           font-style: italic}\n \
+                 .credit {text-align: right; \
+                          margin-bottom: 0em; \
+                          font-size: smaller;}\n \
+                 .articleBody {text-align: left;}\n '
 
     conversion_options = { 'linearize_tables' : True }
-
-    preprocess_regexps = [
-        (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''),
-        (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''),
-    ]
-
-
+    #simultaneous_downloads = 1
 
     feeds = [
             ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
-            ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
             ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
+            ('Personal Tech', 'http://rssfeeds.usatoday.com/UsatodaycomTech-PersonalTalk'),
+            ('Science', 'http://rssfeeds.usatoday.com/TP-ScienceFair'),
+            ('Health', 'http://rssfeeds.usatoday.com/UsatodaycomHealth-TopStories'),
             ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
             ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
             ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
+            ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
             ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
             ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
+            ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
             ]
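+    # Keep only the story components that postprocess_html() below
+    # re-assembles; everything else on the page is discarded at cleanup.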
+    keep_only_tags = [dict(attrs={'class':[
+                                'byLine',
+                                'inside-copy',
+                                'inside-head',
+                                'inside-head2',
+                                'item',
+                                'item-block',
+                                'photo-container',
+                                ]}),
+                      dict(id=[
+                                'applyMainStoryPhoto',
+                                'permalink',
+                                ])]
 
-    ## Getting the print version
+    remove_tags = [dict(attrs={'class':[
+                                'comments',
+                                'jump',
+                                'pagetools',
+                                'post-attributes',
+                                'tags',
+                                ]}),
+                   dict(id=[])]
 
-    def print_version(self, url):
-        return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
+    #feeds = [('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles')]
+
+    def dump_hex(self, src, length=16):
+        ''' Diagnostic: print src as a hex/ASCII dump '''
+        FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])
+        N = 0; result = ''
+        while src:
+            s, src = src[:length], src[length:]
+            hexa = ' '.join(["%02X" % ord(x) for x in s])
+            s = s.translate(FILTER)
+            result += "%04X   %-*s   %s\n" % (N, length*3, hexa, s)
+            N += length
+        print result
+
+    def fixChars(self, string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91", "&#8216;", string)
+
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92", "&#8217;", fixed)
+
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93", "&#8220;", fixed)
+
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94", "&#8221;", fixed)
+
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96", "&#8211;", fixed)
+
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97", "&#8212;", fixed)
+
+        return fixed
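+    # e.g. fixChars('\x93Breaking\x94 \x97 today') -> '&#8220;Breaking&#8221; &#8212; today'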
+
+    def get_masthead_url(self):
+        masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)
+        except:
+            self.log("\nCover unavailable")
+            masthead = None
+        return masthead
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&' with '&#38;'
+            massaged = re.sub("&", "&#38;", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
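+    # e.g. massageNCXText('Ham &amp; Eggs') -> 'Ham &#38; Eggs', which the
+    # Kindle NCX renderer should display as 'Ham & Eggs'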
+
+    def parse_feeds(self, *args, **kwargs):
+        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
+        # Count articles for the progress dialog
+        article_count = 0
+        for feed in parsed_feeds:
+            article_count += len(feed)
+        self.log("Queued %d articles" % article_count)
+        return parsed_feeds
+
+    def preprocess_html(self, soup):
+        soup = self.strip_anchors(soup)
+        return soup
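+    # The bulk of the cleanup happens below: promote the headline to
+    # <h2 class="headline">, normalize the byline, strip inline 'jumpout'
+    # teasers, and re-assemble the lead photo with credit and caption.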
 
     def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td']):
-            t.name = 'div'
+
+        # Remove navLinks <div>
+        navLinks = soup.find(True, {'style':'padding-bottom:3px'})
+        if navLinks:
+            navLinks.extract()
+
+        # Remove the <div> of gibberish following the navLinks
+        gibberish = soup.find(True, {'style':'margin-bottom:10px'})
+        if gibberish:
+            gibberish.extract()
+
+        # Change the headline <div> to <h2 class="headline">
+        headline = soup.find(True, {'class':['inside-head','inside-head2']})
+        if not headline:
+            headline = soup.find('h3')
+        if headline:
+            tag = Tag(soup, "h2")
+            tag['class'] = "headline"
+            tag.insert(0, headline.contents[0])
+            headline.replaceWith(tag)
+        else:
+            print "unable to find headline:\n%s\n" % soup
+
+        # Change byLine to byline, change commas to middot
+        # Kindle renders commas in byline as '&'
+        byline = soup.find(True, {'class':'byLine'})
+        if byline:
+            byline['class'] = 'byline'
+            # Replace comma with middot
+            byline.contents[0].replaceWith(re.sub(",", " &middot;", byline.renderContents()))
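+
+        # 'Jumpouts' are bold ALL-CAPS lead-ins ending in ':' or '?'
+        # (links out to related coverage); they read as noise in an
+        # e-book, so matching paragraphs are dropped outright.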
+        jumpout_punc_list = [':', '?']
+        # Remove the inline jumpouts in <div class="inside-copy">
+        paras = soup.findAll(True, {'class':'inside-copy'})
+        for para in paras:
+            if re.match("<b>[\w\W]+</b> ", para.renderContents()):
+                p = para.find('b')
+                for punc in jumpout_punc_list:
+                    punc_offset = p.contents[0].find(punc)
+                    if punc_offset == -1:
+                        continue
+                    if punc_offset > 1:
+                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
+                            #print "extracting \n%s\n" % para.prettify()
+                            para.extract()
+
+        # Reset class for remaining <div class="inside-copy"> to articleBody
+        paras = soup.findAll(True, {'class':'inside-copy'})
+        for para in paras:
+            para['class'] = 'articleBody'
+
+        # Remove inline jumpouts in <p>
+        paras = soup.findAll(['p'])
+        for p in paras:
+            if hasattr(p, 'contents') and len(p.contents):
+                for punc in jumpout_punc_list:
+                    punc_offset = p.contents[0].find(punc)
+                    if punc_offset == -1:
+                        continue
+                    if punc_offset > 2 and hasattr(p, 'a') and len(p.contents):
+                        #print "evaluating %s\n" % p.contents[0][:punc_offset+1]
+                        if p.contents[0][:punc_offset] == p.contents[0][:punc_offset].upper():
+                            #print "extracting \n%s\n" % p.prettify()
+                            p.extract()
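+
+        # Re-set the lead photo as a centered block after the headline:
+        # a two-row table holds the image and its right-aligned credit,
+        # with the caption as an italic paragraph below the table.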
+        # Capture the first img, insert after headline
+        imgs = soup.findAll('img')
+        print "postprocess_html(): %d images" % len(imgs)
+        if imgs:
+            divTag = Tag(soup, 'div')
+            divTag['class'] = 'image'
+            body = soup.find('body')
+            img = imgs[0]
+            #print "img: \n%s\n" % img.prettify()
+
+            # Table for photo and credit
+            tableTag = Tag(soup, 'table')
+
+            # Photo
+            trimgTag = Tag(soup, 'tr')
+            tdimgTag = Tag(soup, 'td')
+            tdimgTag.insert(0, img)
+            trimgTag.insert(0, tdimgTag)
+            tableTag.insert(0, trimgTag)
+
+            # Credit
+            trcreditTag = Tag(soup, 'tr')
+
+            tdcreditTag = Tag(soup, 'td')
+            tdcreditTag['class'] = 'credit'
+            credit = soup.find('td', {'class':'photoCredit'})
+            if credit:
+                tdcreditTag.insert(0, NavigableString(credit.renderContents()))
+            else:
+                credit = img.get('credit')
+                if credit:
+                    tdcreditTag.insert(0, NavigableString(credit))
+                else:
+                    tdcreditTag.insert(0, NavigableString(''))
+
+            trcreditTag.insert(0, tdcreditTag)
+            tableTag.insert(1, trcreditTag)
+            dtc = 0
+            divTag.insert(dtc, tableTag)
+            dtc += 1
+
+            if False:
+                # Add the caption in the table
+                tableCaptionTag = Tag(soup, 'caption')
+                tableCaptionTag.insert(0, soup.find('td', {'class':'photoCredit'}).renderContents())
+                tableTag.insert(1, tableCaptionTag)
+                divTag.insert(dtc, tableTag)
+                dtc += 1
+                body.insert(1, divTag)
+            else:
+                # Add the caption below the table
+                #print "Looking for caption in this soup:\n%s" % img.prettify()
+                captionTag = Tag(soup, 'p')
+                captionTag['class'] = 'caption'
+                if hasattr(img, 'alt') and img['alt']:
+                    captionTag.insert(0, NavigableString('%s' % img['alt']))
+                    divTag.insert(dtc, captionTag)
+                    dtc += 1
+                else:
+                    try:
+                        captionTag.insert(0, NavigableString('%s' % img['cutline']))
+                        divTag.insert(dtc, captionTag)
+                        dtc += 1
+                    except:
+                        pass
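+
+            # Close the image block with a rule, then walk the parse tree
+            # to the promoted headline and attach the block to it; final
+            # element order is fixed up below.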
+            hrTag = Tag(soup, 'hr')
+            divTag.insert(dtc, hrTag)
+            dtc += 1
+
+            # Restructure: insert the <div class="image"> after the headline
+            insert_loc = 0
+            tag = body.find(True)
+            while True:
+                insert_loc += 1
+                try:
+                    if hasattr(tag, 'class') and tag['class'] == 'headline':
+                        tag.insert(insert_loc, divTag)
+                        break
+                except:
+                    pass
+                tag = tag.next
+                if not tag:
+                    break
+
+            # Yank out headline, img and caption
+            headline = body.find('h2', 'headline')
+            img = body.find('div', 'image')
+            caption = body.find('p', 'caption')
+
+            # body(0) is calibre_navbar
+            # body(1) is <h2 class="headline">
+            btc = 1
+            headline.extract()
+            body.insert(1, headline)
+            btc += 1
+            if img:
+                img.extract()
+                body.insert(btc, img)
+                btc += 1
+            if caption:
+                caption.extract()
+                body.insert(btc, caption)
+                btc += 1
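+
+            # Extra images are simply dropped for now; the formatting loop
+            # below is disabled until it renders correctly.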
+            if len(imgs) > 1:
+                if True:
+                    [img.extract() for img in imgs[1:]]
+                else:
+                    # Format the remaining images
+                    # This doesn't work yet
+                    for img in imgs[1:]:
+                        print "img:\n%s\n" % img.prettify()
+                        divTag = Tag(soup, 'div')
+                        divTag['class'] = 'image'
+
+                        # Table for photo and credit
+                        tableTag = Tag(soup, 'table')
+
+                        # Photo
+                        trimgTag = Tag(soup, 'tr')
+                        tdimgTag = Tag(soup, 'td')
+                        tdimgTag.insert(0, img)
+                        trimgTag.insert(0, tdimgTag)
+                        tableTag.insert(0, trimgTag)
+
+                        # Credit
+                        trcreditTag = Tag(soup, 'tr')
+
+                        tdcreditTag = Tag(soup, 'td')
+                        tdcreditTag['class'] = 'credit'
+                        try:
+                            tdcreditTag.insert(0, NavigableString(img['credit']))
+                        except:
+                            tdcreditTag.insert(0, NavigableString(''))
+                        trcreditTag.insert(0, tdcreditTag)
+                        tableTag.insert(1, trcreditTag)
+                        divTag.insert(0, tableTag)
+                        soup.img.replaceWith(divTag)
+
+        return soup
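+
+    # Fill in NCX author/description fields that the feeds left empty,
+    # mining the downloaded article HTML for a byline and first paragraph.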
+    def postprocess_book(self, oeb, opts, log) :
+
+        def extract_byline(href) :
+            # Pull the <div class="byline"> from the article HTML
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            byline = soup.find(True, {'class':'byline'})
+            if byline :
+                return self.massageNCXText(self.tag_to_string(byline, use_alt=False))
+            else :
+                return None
+
+        def extract_description(href) :
+            # Use the first non-empty <p> as the article description
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            paras = soup.findAll('p')
+            for p in paras :
+                if p.renderContents() > '' :
+                    return self.massageNCXText(self.tag_to_string(p, use_alt=False))
+            else:
+                print "Didn't find <p> in this soup:\n%s" % soup.prettify()
+                return None
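+
+        # toc.depth() == 2: flat TOC (feed -> articles)
+        # toc.depth() == 3: sectioned TOC (feed -> sections -> articles)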
+        # Method entry point here
+        # Single section toc looks different than multi-section tocs
+        if oeb.toc.depth() == 2 :
+            for article in oeb.toc :
+                if article.author is None :
+                    article.author = extract_byline(article.href)
+                if article.description is None :
+                    article.description = extract_description(article.href)
+        elif oeb.toc.depth() == 3 :
+            for section in oeb.toc :
+                for article in section :
+                    article.author = extract_byline(article.href)
+                    '''
+                    if article.author is None :
+                        article.author = self.massageNCXText(extract_byline(article.href))
+                    else:
+                        article.author = self.massageNCXText(article.author)
+                    '''
+                    if article.description is None :
+                        article.description = extract_description(article.href)
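+
+    # Replace each text-only <a> with its contents so body copy isn't
+    # littered with underlined links; anchors wrapping images are kept.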
+    def strip_anchors(self, soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
+        return soup