#!/usr/bin/env python
'''
calibre recipe for Reader's Digest (rd.com).

The recipe builds its index from two sources: section landing pages on
www.rd.com, which are scraped directly, and the RSS feeds listed in
``ReadersDigest.feeds``. Articles whose title mentions "recipe" are pulled
out of the RSS feeds and grouped into a separate, virtual "Recipes" section.
'''

__license__ = 'GPL v3'

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed


class ReadersDigest(BasicNewsRecipe):
    '''Download Reader's Digest articles from scraped pages and RSS feeds.'''

    # Settings read by BasicNewsRecipe; see the calibre recipe API
    # documentation for the exact meaning of each attribute.
    title = 'Readers Digest'
    __author__ = 'BrianG'
    language = 'en'
    description = 'Readers Digest Feeds'
    no_stylesheets = True
    use_embedded_content = False
    oldest_article = 60
    max_articles_per_feed = 200
    remove_javascript = True

    # Kept byte-for-byte (including the repeated ``.credit`` rule); the
    # pieces below are joined by implicit string concatenation.
    extra_css = (
        ' h1 {font-family:georgia,serif;color:#000000;}'
        ' .mainHd{font-family:georgia,serif;color:#000000;}'
        ' h2 {font-family:Arial,Sans-serif;}'
        ' .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }'
        ' .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}'
        ' .byline{font-family:Arial,Sans-serif; font-size:x-small ;}'
        ' .photoBkt{ font-size:x-small ;}'
        ' .vertPhoto{font-size:x-small ;}'
        ' .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}'
        ' .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}'
        ' .artTxt{font-family:georgia,serif;}'
        ' .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}'
        ' .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}'
        ' a:link{color:#CC0000;}'
        ' .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}'
        ' '
    )

    # Page furniture stripped from every downloaded article.
    remove_tags = [
        dict(name='h4', attrs={'class': 'close'}),
        dict(name='div', attrs={'class': 'fromLine'}),
        dict(name='img', attrs={'class': 'colorTag'}),
        dict(name='div', attrs={'id': 'sponsorArticleHeader'}),
        dict(name='div', attrs={'class': 'horizontalAd'}),
        dict(name='div', attrs={'id': 'imageCounterLeft'}),
        dict(name='div', attrs={'id': 'commentsPrint'}),
    ]

    # (section title, RSS URL) pairs; parse_index() matches on these titles
    # to decide where each section is placed.
    feeds = [
        ('New in RD', 'http://feeds.rd.com/ReadersDigest'),
        ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
        ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
        ('Blogs', 'http://feeds.rd.com/ReadersDigestBlogs'),
    ]

    cover_url = 'http://www.rd.com/images/logo-main-rd.gif'

    def print_version(self, url):
        '''
        Return the URL of the printer-friendly page for an article.

        :param url: URL of the regular article page.
        :return: The print URL when one can be derived, otherwise ``url``
                 unchanged.

        URLs containing ``/article`` are rewritten from the identity number
        embedded in the URL. URLs containing ``/post`` require fetching the
        page itself and reading the link inside its ``printBlock`` list.
        '''
        article_pos = url.find('/article')
        if article_pos > 0:
            suffix_pos = url.find('.html?')
            if suffix_pos < 0:
                # Without the '.html?' marker the slice below would be
                # computed from -1 and yield a meaningless id.
                return url
            # The id starts right after '/article'. The four characters
            # immediately before '.html?' are dropped - presumably a fixed
            # suffix in rd.com article URLs; TODO confirm.
            ident = url[article_pos + len('/article'):suffix_pos - 4]
            return 'http://www.rd.com/content/printContent.do?contentId=' + ident

        if url.find('/post') > 0:
            # In this case the page itself has to be fetched to derive the
            # print page.
            soup = self.index_to_soup(url)
            print_block = soup.find('ul', attrs={'class': 'printBlock'})
            links = print_block('a') if print_block is not None else []
            if not links:
                # No print link on the page: fall back to the regular URL
                # instead of failing the whole article.
                return url
            print_url = 'http://www.rd.com' + links[0]['href']
            # Cut everything from '&Keep' onwards. partition() leaves the
            # URL intact when the marker is absent, whereas slicing with
            # find() == -1 would chop off the last character.
            return print_url.partition('&Keep')[0]

        return url

    def parse_index(self):
        '''
        Build the list of sections for the download.

        :return: A list of ``(section title, articles)`` tuples, where
                 ``articles`` is a list of dicts with the keys ``title``,
                 ``url``, ``date`` and ``description``.

        Scraped site pages are added first, then the sections produced by
        parse_rss(). 'New in RD' is inserted at position 0 and 'Blogs' at
        position 1; every other RSS section is appended at the end.
        '''
        pages = [
            ('Your America',
             'http://www.rd.com/your-america-inspiring-people-and-stories',
             'channelLeftContainer', {'class': 'moreLeft'}),
            # NOTE(review): in the flattened source the next entry directly
            # follows the comment "# useless recipes"; it is treated here as
            # having been disabled by that comment - confirm.
            # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
            ('Advice and Know-How',
             'http://www.rd.com/advice-and-know-how',
             'channelLeftContainer', {'class': 'moreLeft'}),
        ]

        feeds = []
        for section, url, divider, attr_list in pages:
            feeds.append((section, self.page_parse(url, divider, attr_list)))

        # parse_rss() returns Feed objects; convert each feed/article
        # combination into the plain tuple/dict form returned by this method.
        for feed in self.parse_rss():
            articles = [
                {
                    'title': article.title,
                    'url': article.url,
                    'date': article.date,
                    'description': article.text_summary,
                }
                for article in feed.articles
            ]
            # New and Blogs should be the first two feeds.
            if feed.title == 'New in RD':
                feeds.insert(0, (feed.title, articles))
            elif feed.title == 'Blogs':
                feeds.insert(1, (feed.title, articles))
            else:
                feeds.append((feed.title, articles))

        return feeds

    def page_parse(self, mainurl, divider, attrList):
        '''
        Scrape one section landing page into a list of article dicts.

        :param mainurl: URL of the section page to fetch.
        :param divider: Accepted for interface compatibility; not used by
                        this implementation.
        :param attrList: Attribute filter (e.g. ``{'class': 'moreLeft'}``)
                         selecting the elements that hold one article each.
        :return: A list of dicts with the keys ``title``, ``url``, ``date``
                 and ``description``; the last two are always empty strings.
        '''
        articles = []
        mainsoup = self.index_to_soup(mainurl)
        for item in mainsoup.findAll(attrs=attrList):
            images = item('img')
            links = item('a')
            if not images or not links:
                # A matched element without an image or a link cannot yield
                # a title/URL; skip it rather than abort the whole section.
                continue
            articles.append({
                # The title is taken from the teaser image's alt text.
                'title': images[0]['alt'],
                'url': 'http://www.rd.com' + links[0]['href'],
                'date': '',
                'description': '',
            })
        return articles

    def parse_rss(self):
        '''
        Parse the RSS feeds and move recipe articles into their own feed.

        :return: The list of Feed objects from
                 ``BasicNewsRecipe.parse_feeds``, with every article whose
                 title contains "recipe" (case-insensitive) removed from its
                 feed and, if any were found, an extra 'Recipes' Feed
                 appended that holds them.
        '''
        # Do the "official" parse_feeds first.
        feeds = BasicNewsRecipe.parse_feeds(self)

        recipe_articles = []
        for feed in feeds:
            kept = []
            for article in feed.articles:
                if 'RECIPE' in article.title.upper():
                    recipe_articles.append(article)
                else:
                    kept.append(article)
            if len(kept) != len(feed.articles):
                # Replace the contents in place so the existing list object
                # held by the feed is the one that gets updated.
                feed.articles[:] = kept

        # If any recipes were found, create a new Feed object holding them
        # and append it to the "official" list of feeds.
        if recipe_articles:
            pfeed = Feed()
            pfeed.title = 'Recipes'
            pfeed.description = 'Recipe Feed (Virtual)'
            pfeed.image_url = None
            pfeed.oldest_article = 30
            pfeed.id_counter = len(recipe_articles)
            pfeed.articles = recipe_articles[:]
            feeds.append(pfeed)

        return feeds