from calibre.web.feeds.news import BasicNewsRecipe
import re


class TelevisionWithoutPity(BasicNewsRecipe):
    """Calibre recipe that downloads recaps from Television Without Pity.

    As configured, articles come from the RSS feed listed in ``feeds``.
    A disabled ``parse_index()`` is kept further down for pulling every
    recap of one show from the page named by ``SHOW``.
    """

    title = u'Television Without Pity'
    language = 'en'
    __author__ = 'Snarkastica'
    # Used for pulling down an entire show, not just the RSS feed
    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
    oldest_article = 7  # days
    max_articles_per_feed = 25
    # reverse_article_order = True  # Useful for entire show, to display in episode order
    # encoding = 'cp1252'
    use_embedded_content = False

    # Collapse a headline <span> whose class attribute starts with
    # "headline_recap_title " (i.e. has further classes after it) down to the
    # bare class name, the value keep_only_tags below selects on.
    # NOTE(review): this entry was truncated to an unparsable
    # "[(re.compile(r'')]" in the file; the pattern and replacement are a
    # reconstruction -- confirm against the site's markup.
    preprocess_regexps = [
        (
            re.compile(r'<span class="headline_recap_title .*?>',
                       re.DOTALL | re.IGNORECASE),
            lambda match: '<span class="headline_recap_title">',
        ),
    ]

    keep_only_tags = [
        dict(name='span', attrs={'class': 'headline_recap_title'}),
        dict(name='p', attrs={'class': 'byline'}),
        dict(name='div', attrs={'class': 'body_recap'}),
        dict(name='h1'),
    ]
    no_stylesheets = True

    # Comment this out and configure parse_index() to retrieve a single show
    feeds = [
        ('Latest Recaps', 'http://www.televisionwithoutpity.com/rss.xml'),
    ]

    # This method can be used to grab all recaps for a single show.
    # Set the SHOW constant at the beginning of this file to the URL for a
    # show's recap page (the page listing all recaps), usually of the form:
    #     http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/
    # where SHOW-NAME is the hyphenated name of the show.
    #
    # To use:
    # 1. Comment out feeds = [...] earlier in this file
    # 2. Set the SHOW constant to the show's recap page
    # 3. Uncomment the following function (remove the surrounding triple quotes)

    '''
    def parse_index(self):
        soup = self.index_to_soup(self.SHOW)
        feeds = []
        articles = []
        showTitle = soup.find('h1').string
        recaps = soup.find('table')
        for ep in recaps.findAll('tr'):
            epData = ep.findAll('td')
            epNum = epData[0].find(text=True).strip()
            if not epNum == "Ep.":
                epT = self.tag_to_string(epData[1].find('em')).strip()
                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
                epTitle = epNum + ": " + epT + epST
                epData[1].find('em').extract()
                epURL = epData[1].find('a', href=True)
                epURL = epURL['href']
                epSum = self.tag_to_string(epData[1].find('p')).strip()
                epDate = epData[2].find(text=True).strip()
                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
        feeds.append((showTitle, articles))
        #self.abort_recipe_processing("test")
        return feeds
    '''

    # This will add subsequent pages of multipage recaps to a single article page
    def append_page(self, soup, appendtag, position):
        """Pull the following pages of a recap into ``appendtag``.

        Looks in ``soup`` for the ``<p class="pages">`` pager and its "Next"
        text, fetches the linked page, strips inline ``style`` attributes
        from that page's ``<div class="body_recap">``, recurses so that any
        later pages are nested into that div first, and finally inserts the
        div into ``appendtag`` at index ``position``.

        Does nothing when there is no pager, no "Next" entry, no ``href`` on
        the element holding "Next", or no recap body on the fetched page.
        """
        pages = soup.find('p', attrs={'class': 'pages'})
        if pages is None:
            # No pager at all: a single-page recaplet, nothing to append.
            return
        pager = pages.find(text='Next')
        if not pager:
            return
        # "Next" may not be wrapped in a link (no href); treat that as the
        # end of the chain instead of raising KeyError.
        nexturl = pager.parent.get('href')
        if not nexturl:
            return
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'body_recap'})
        if texttag is None:
            # The fetched page has no recap body; keep what we already have.
            return
        for it in texttag.findAll(style=True):
            del it['style']
        # Append the rest of the chain to the end of this page's body before
        # moving the body into the caller's tree.
        self.append_page(soup2, texttag, len(texttag.contents))
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge every follow-up page into the article body and return ``soup``."""
        self.append_page(soup, soup.body, 3)
        return soup

    # Remove the multi page links (we had to keep these in for append_page(),
    # but they can go away now).
    # Could have used CSS to hide, but some readers ignore CSS.
    def postprocess_html(self, soup, first_fetch):
        """Drop all ``<p class="pages">`` pager elements and return ``soup``."""
        for p in soup.findAll('p', attrs={'class': 'pages'}):
            p.extract()

        # TODO: Fix this so it converts the headline class into a heading 1
        # titleTag = Tag(soup, "h1")
        # repTag = soup.find('span', attrs={'class':'headline_recap_title'})
        # titleTag.insert(0, repTag.contents[0])
        # repTag.extract()
        # soup.body.insert(1, titleTag)
        return soup