Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Update Television Without Pity
This commit is contained in:
parent 38a4942a6e
commit ecf1e290b8
@@ -1,21 +1,98 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import re

-class HindustanTimes(BasicNewsRecipe):
+class TelevisionWithoutPity(BasicNewsRecipe):
     title = u'Television Without Pity'
     language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 1  # days
+    __author__ = 'Snarkastica'
+    SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME-HERE/recaps/'  # Used for pulling down an entire show, not just the RSS feed
+    oldest_article = 7  # days
     max_articles_per_feed = 25
+    # reverse_article_order = True  # Useful for an entire show, to display in episode order
     #encoding = 'cp1252'
     use_embedded_content = False

+    preprocess_regexps = [(re.compile(r'<span class="headline_recap_title .*?>', re.DOTALL|re.IGNORECASE), lambda match: '<span class="headline_recap_title">')]
+    keep_only_tags = [dict(name='span', attrs={'class':'headline_recap_title'}), dict(
+        name='p', attrs={'class':'byline'}), dict(name='div', attrs={'class':'body_recap'}), dict(name='h1')]
     no_stylesheets = True
-    auto_cleanup = True
-    #auto_cleanup_keep = '//div[@class="float_right"]'

+    # Comment this out and configure parse_index() to retrieve a single show
     feeds = [
-        ('News',
+        ('Latest Recaps',
         'http://www.televisionwithoutpity.com/rss.xml'),
     ]
+
+    '''
+    This method can be used to grab all recaps for a single show.
+    Set the SHOW constant at the beginning of this file to the URL for a show's recap page
+    (the page listing all recaps, usually of the form
+    http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/,
+    where SHOW-NAME is the hyphenated name of the show).
+
+    To use:
+    1. Comment out feeds = [...] earlier in this file
+    2. Set the SHOW constant to the show's recap page
+    3. Uncomment the following function
+    '''
+
+    '''
+    def parse_index(self):
+        soup = self.index_to_soup(self.SHOW)
+        feeds = []
+        articles = []
+        showTitle = soup.find('h1').string
+        recaps = soup.find('table')
+        for ep in recaps.findAll('tr'):
+            epData = ep.findAll('td')
+            epNum = epData[0].find(text=True).strip()
+            if epNum != "Ep.":
+                epT = self.tag_to_string(epData[1].find('em')).strip()
+                epST = " (or " + self.tag_to_string(epData[1].find('h3')).strip() + ")"
+                epTitle = epNum + ": " + epT + epST
+                epData[1].find('em').extract()
+                epURL = epData[1].find('a', href=True)
+                epURL = epURL['href']
+                epSum = self.tag_to_string(epData[1].find('p')).strip()
+                epDate = epData[2].find(text=True).strip()
+                epAuthor = self.tag_to_string(epData[4].find('p')).strip()
+                articles.append({'title': epTitle, 'url': epURL, 'description': epSum, 'date': epDate, 'author': epAuthor})
+        feeds.append((showTitle, articles))
+        # self.abort_recipe_processing("test")
+        return feeds
+    '''
+
+    # This will add subsequent pages of multipage recaps to a single article page
+    def append_page(self, soup, appendtag, position):
+        if soup.find('p', attrs={'class': 'pages'}):  # if absent, we still grab single-page recaplets
+            pager = soup.find('p', attrs={'class': 'pages'}).find(text='Next')
+            if pager:
+                nexturl = pager.parent['href']
+                soup2 = self.index_to_soup(nexturl)
+                texttag = soup2.find('div', attrs={'class': 'body_recap'})
+                for it in texttag.findAll(style=True):
+                    del it['style']
+                newpos = len(texttag.contents)
+                self.append_page(soup2, texttag, newpos)
+                texttag.extract()
+                appendtag.insert(position, texttag)
+
+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body, 3)
+        return soup
+
+    # Remove the multi-page links (we had to keep these in for append_page(), but they can go away now).
+    # Could have used CSS to hide them, but some readers ignore CSS.
+    def postprocess_html(self, soup, first_fetch):
+        paginator = soup.findAll('p', attrs={'class': 'pages'})
+        if paginator:
+            for p in paginator:
+                p.extract()
+
+        # TODO: Fix this so it converts the headline class into a heading 1
+        # titleTag = Tag(soup, "h1")
+        # repTag = soup.find('span', attrs={'class':'headline_recap_title'})
+        # titleTag.insert(0, repTag.contents[0])
+        # repTag.extract()
+        # soup.body.insert(1, titleTag)
+        return soup
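
Read straight through, the docstring in this commit amounts to a small configuration change. As a reference, here is a minimal sketch of the single-show variant it describes; the class name and the SHOW-NAME placeholder are hypothetical, and the attribute values are taken from the recipe itself:

    from calibre.web.feeds.news import BasicNewsRecipe

    class TelevisionWithoutPitySingleShow(BasicNewsRecipe):  # hypothetical name
        title = u'Television Without Pity'
        language = 'en'
        # Step 2: point SHOW at the show's recap page (SHOW-NAME is a placeholder)
        SHOW = 'http://www.televisionwithoutpity.com/show/SHOW-NAME/recaps/'
        reverse_article_order = True  # display recaps in episode order
        use_embedded_content = False
        no_stylesheets = True
        # Step 1: the feeds = [...] list is left commented out
        # feeds = [('Latest Recaps', 'http://www.televisionwithoutpity.com/rss.xml')]
        # Step 3: the parse_index() method quoted in the diff above goes here, uncommented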
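
The TODO in postprocess_html() is left open by this commit. A possible completion, as a sketch only: it assumes the BeautifulSoup 3 Tag API that the commented-out lines already target (Tag(soup, "h1")) and the calibre.ebooks.BeautifulSoup import path that recipes of this era commonly use:

    from calibre.ebooks.BeautifulSoup import Tag

    def postprocess_html(self, soup, first_fetch):
        for p in soup.findAll('p', attrs={'class': 'pages'}):
            p.extract()  # drop the multi-page links kept around for append_page()
        repTag = soup.find('span', attrs={'class': 'headline_recap_title'})
        if repTag is not None and repTag.contents:
            titleTag = Tag(soup, 'h1')              # build the new heading element
            titleTag.insert(0, repTag.contents[0])  # move the headline text into it
            repTag.extract()                        # remove the old span
            soup.body.insert(1, titleTag)           # place the heading at the top of the body
        return soup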