Update Television Without Pity

parent 38a4942a6e
commit ecf1e290b8

@@ -1,21 +1,98 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re

-class HindustanTimes(BasicNewsRecipe):
+class TelevisionWithoutPity(BasicNewsRecipe):
     title = u'Television Without Pity'
     language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 1  # days
+    __author__ = 'Snarkastica'
+    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'  # Used for pulling down an entire show, not just the RSS feed
+    oldest_article = 7  # days
     max_articles_per_feed = 25
+    # reverse_article_order = True  # Useful for an entire show, to display in episode order
     #encoding = 'cp1252'
     use_embedded_content = False

     preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>', re.DOTALL|re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
     keep_only_tags = [dict(name='span', attrs={'class':'headline_recap_title'}), dict(
         name='p', attrs={'class':'byline'}), dict(name='div', attrs={'class':'body_recap'}), dict(name='h1')]
     no_stylesheets = True
     auto_cleanup = True
+    #auto_cleanup_keep = '//div[@class="float_right"]'
+
+    # Comment this out and configure parse_index() to retrieve a single show
     feeds = [
-        ('News',
+        ('Latest Recaps',
          'http://www.televisionwithoutpity.com/rss.xml'),
     ]
+
+    '''
+    This method can be used to grab all recaps for a single show.
+    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
+    (the page listing all recaps, usually of the form
+    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/),
+    where SHOW-NAME is the hyphenated name of the show.
+
+    To use:
+    1. Comment out feeds = [...] earlier in this file
+    2. Set the SHOW constant to the show's recap page
+    3. Uncomment the following function
+    '''
+
+    '''
+    def parse_index(self):
+        soup = self.index_to_soup(self.SHOW)
+        feeds = []
+        articles = []
+        showTitle = soup.find('h1').string
+        recaps = soup.find('table')
+        for ep in recaps.findAll('tr'):
+            epData = ep.findAll('td')
+            epNum = epData[0].find(text=True).strip()
+            if not epNum == "Ep.":
+                epT = self.tag_to_string(epData[1].find('em')).strip()
+                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
+                epTitle = epNum + ": " + epT + epST
+                epData[1].find('em').extract()
+                epURL = epData[1].find('a', href=True)
+                epURL = epURL['href']
+                epSum = self.tag_to_string(epData[1].find('p')).strip()
+                epDate = epData[2].find(text=True).strip()
+                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
+                articles.append({'title':epTitle, 'url':epURL, 'description':epSum, 'date':epDate, 'author':epAuthor})
+        feeds.append((showTitle, articles))
+        #self.abort_recipe_processing("test")
+        return feeds
+    '''
+
+    # This will add subsequent pages of multi-page recaps to a single article page
+    def append_page(self, soup, appendtag, position):
+        if soup.find('p', attrs={'class':'pages'}):  # If there is no pager, this is a single-page recaplet and is grabbed as-is
+            pager = soup.find('p', attrs={'class':'pages'}).find(text='Next')
+            if pager:
+                nexturl = pager.parent['href']
+                soup2 = self.index_to_soup(nexturl)
+                texttag = soup2.find('div', attrs={'class':'body_recap'})
+                for it in texttag.findAll(style=True):
+                    del it['style']
+                newpos = len(texttag.contents)
+                self.append_page(soup2, texttag, newpos)
+                texttag.extract()
+                appendtag.insert(position, texttag)
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body, 3)
+        return soup
+
+    # Remove the multi-page links (we had to keep these in for append_page(), but they can go away now).
+    # Could have used CSS to hide them, but some readers ignore CSS.
+    def postprocess_html(self, soup, first_fetch):
+        paginator = soup.findAll('p', attrs={'class':'pages'})
+        if paginator:
+            for p in paginator:
+                p.extract()
+
+        # TODO: Fix this so it converts the headline class into a heading 1
+        #titleTag = Tag(soup, "h1")
+        #repTag = soup.find('span', attrs={'class':'headline_recap_title'})
+        #titleTag.insert(0, repTag.contents[0])
+        #repTag.extract()
+        #soup.body.insert(1, titleTag)
+        return soup
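
Usage sketch (editor's note, not part of this commit): following the three steps in the docstring, pulling every recap for a single show would look roughly like this at the top of the class; the show slug 'gossip-girl' is a hypothetical example.

    SHOW = 'http://www.televisionwithoutpity.com/show/gossip-girl/recaps/'  # hypothetical show slug
    reverse_article_order = True  # per the comment above, lists recaps in episode order
    # feeds = [
    #     ('Latest Recaps',
    #      'http://www.televisionwithoutpity.com/rss.xml'),
    # ]

With parse_index() uncommented, calibre uses it to build the index. It must return a list of (feed_title, article_list) tuples, and the commented-out function above builds exactly that, each article being a dict carrying 'title', 'url', 'description', 'date', and 'author'.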
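Design note on append_page(): it recurses before splicing, so the deepest page is processed first; each later page's body_recap div is appended to the end of the previous page's div, and the assembled chain is finally inserted into the first page's body. The result is one article page with the recap pages in reading order. The position argument of 3 in preprocess_html() appears to target the slot just after the headline and byline once keep_only_tags has pruned the page; that offset is inferred from the code, not documented behavior.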
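The TODO block at the end of postprocess_html() is nearly working code. A minimal sketch of how it could be completed, assuming the Tag constructor from BeautifulSoup 3 (historically importable in recipes via calibre.ebooks.BeautifulSoup, an assumption worth verifying) and adding a guard for recaps that lack the headline span:

    from calibre.ebooks.BeautifulSoup import Tag

    def postprocess_html(self, soup, first_fetch):
        # Drop the pager links that were kept around for append_page()
        for p in soup.findAll('p', attrs={'class': 'pages'}):
            p.extract()
        # Promote the recap headline span to a real <h1> so readers render it as a title
        repTag = soup.find('span', attrs={'class': 'headline_recap_title'})
        if repTag is not None and repTag.contents:
            titleTag = Tag(soup, 'h1')
            titleTag.insert(0, repTag.contents[0])  # move the headline text into the new tag
            repTag.extract()
            soup.body.insert(1, titleTag)
        return soup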