from calibre.web.feeds.news import BasicNewsRecipe
import re


class TelevisionWithoutPity(BasicNewsRecipe):
    """Recipe that downloads episode recaps from Television Without Pity.

    By default the articles come from the RSS feed listed in ``feeds``.
    Alternatively, every recap of a single show can be fetched by pointing
    ``SHOW`` at the show's recap listing and enabling ``parse_index`` (see
    the instructions further down in this class).

    Multi-page recaps are stitched into a single article by
    ``append_page``; the pagination links are stripped afterwards in
    ``postprocess_html``.
    """

    title = u'Television Without Pity'
    language = 'en'
    __author__ = 'Snarkastica'
    # Used for pulling down an entire show, not just the RSS feed
    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'
    oldest_article = 7  # days
    max_articles_per_feed = 25
    # reverse_article_order=True # Useful for entire show, to display in episode order
    #encoding = 'cp1252'
    use_embedded_content = False

    # NOTE(review): the pattern and its replacement were garbled in the
    # source this was recovered from (the entry was left as an unbalanced
    # `(re.compile(r'')`). The pair below is a reconstruction: it reduces the
    # headline span's class attribute to exactly 'headline_recap_title' so
    # that the keep_only_tags selector underneath can match it. Confirm
    # against the site's markup / upstream recipe.
    preprocess_regexps = [
        (re.compile(r'<span class="headline_recap_title .*?>', re.DOTALL | re.IGNORECASE),
         lambda match: '<span class="headline_recap_title">'),
    ]
    keep_only_tags = [
        dict(name='span', attrs={'class': 'headline_recap_title'}),
        dict(name='p', attrs={'class': 'byline'}),
        dict(name='div', attrs={'class': 'body_recap'}),
        dict(name='h1'),
    ]
    no_stylesheets = True

    # Comment this out and configure parse_index() to retrieve a single show
    feeds = [
        ('Latest Recaps',
         'http://www.televisionwithoutpity.com/rss.xml'),
    ]

    '''
    This method can be used to grab all recaps for a single show
    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
    (the page listing all recaps, usually of the form:
    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/)
    Where SHOW-NAME is the hyphenated name of the show.

    To use:
    1. Comment out feeds = [...] earlier in this file
    2. Set the SHOW constant to the show's recap page
    3. Uncomment the following function
    '''

    '''
    def parse_index(self):
        soup = self.index_to_soup(self.SHOW)
        feeds = []
        articles = []
        showTitle = soup.find('h1').string
        recaps = soup.find('table')
        for ep in recaps.findAll('tr'):
            epData = ep.findAll('td')
            epNum = epData[0].find(text=True).strip()
            if not epNum == "Ep.":
                epT = self.tag_to_string(epData[1].find('em')).strip()
                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
                epTitle = epNum + ": " + epT + epST
                epData[1].find('em').extract()
                epURL = epData[1].find('a', href=True)
                epURL = epURL['href']
                epSum = self.tag_to_string(epData[1].find('p')).strip()
                epDate = epData[2].find(text=True).strip()
                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
        feeds.append((showTitle, articles))
        #self.abort_recipe_processing("test")
        return feeds
    '''

    def append_page(self, soup, appendtag, position):
        """Append the following pages of a multi-page recap to one article.

        Looks for a "Next" link inside ``<p class="pages">`` in *soup*,
        downloads that page, recursively pulls in any pages after it, and
        inserts the resulting ``<div class="body_recap">`` into *appendtag*
        at index *position*.

        Does nothing when there is no pager, no "Next" link, no usable
        ``href`` on the link, or no recap body on the next page, so
        single-page recaplets are still kept as they are.
        """
        pages = soup.find('p', attrs={'class': 'pages'})
        if pages is None:
            return
        pager = pages.find(text='Next')
        if not pager:
            return
        # `pager` is the link text; its parent tag carries the URL.
        nexturl = pager.parent.get('href')
        if not nexturl:
            return
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'body_recap'})
        if texttag is None:
            # Next page has no recap body; nothing to append.
            return
        for styled in texttag.findAll(style=True):
            del styled['style']
        # Later pages are appended at the end of this page's body before
        # the body itself is moved into the first page.
        self.append_page(soup2, texttag, len(texttag.contents))
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge all pages of a multi-page recap into *soup* and return it."""
        self.append_page(soup, soup.body, 3)
        return soup

    # Remove the multi page links (we had to keep these in for append_page(),
    # but they can go away now).
    # Could have used CSS to hide, but some readers ignore CSS.
    def postprocess_html(self, soup, first_fetch):
        """Strip every ``<p class="pages">`` pager from *soup* and return it."""
        for pager in soup.findAll('p', attrs={'class': 'pages'}):
            pager.extract()

        # TODO: Fix this so it converts the headline class into a heading 1
        #titleTag = Tag(soup, "h1")
        #repTag = soup.find('span', attrs={'class':'headline_recap_title'})
        #titleTag.insert(0, repTag.contents[0])
        # repTag.extract()
        #soup.body.insert(1, titleTag)
        return soup