from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup from datetime import date, timedelta class WaPoCartoonsRecipe(BasicNewsRecipe): __license__ = 'GPL v3' __author__ = 'kwetal' language = 'en' version = 2 title = u'Washington Post Cartoons' publisher = u'Washington Post' category = u'News, Cartoons' description = u'Cartoons from the Washington Post' oldest_article = 7 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True feeds = [] feeds.append((u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/')) feeds.append((u'Auth', u'http://www.uclick.com/client/wpc/ta/')) feeds.append( (u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb')) feeds.append((u'Carlson', u'http://www.uclick.com/client/wpc/sc/')) feeds.append( (u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk')) feeds.append((u'McCoy', u'http://www.uclick.com/client/wpc/gm/')) feeds.append((u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/')) feeds.append( (u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/')) feeds.append((u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/')) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif;} h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;} #name {margin-bottom: 0.2em} #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;} ''' def parse_index(self): index = [] oldestDate = date.today() - timedelta(days=self.oldest_article) oldest = oldestDate.strftime('%Y%m%d') for feed in self.feeds: cartoons = [] soup = self.index_to_soup(feed[1]) cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description': ''} cartoons.append(cartoon) select = soup.find('select', attrs={'name': ['url', 'dest']}) if select: cartoonCandidates = [] if select['name'] == 'url': cartoonCandidates = self.cartoonCandidatesWaPo( select, oldest) else: cartoonCandidates = self.cartoonCandidatesCreatorsCom( select, oldest) for cartoon in cartoonCandidates: cartoons.append(cartoon) index.append([feed[0], cartoons]) return index def preprocess_html(self, soup): freshSoup = self.getFreshSoup(soup) div = soup.find('div', attrs={'id': 'name'}) if div: freshSoup.body.append(div) comic = soup.find('div', attrs={'id': 'comic_full'}) img = comic.find('img') if '&' in img['src']: img['src'], sep, bad = img['src'].rpartition('&') freshSoup.body.append(comic) freshSoup.body.append(soup.find('div', attrs={'id': 'copyright'})) else: span = soup.find('span', attrs={'class': 'title'}) if span: del span['class'] span['id'] = 'name' span.name = 'div' freshSoup.body.append(span) img = soup.find('img', attrs={'class': 'pic_big'}) if img: td = img.parent td['style'] = '' del td['style'] td.name = 'div' td['id'] = 'comic_full' freshSoup.body.append(td) td = soup.find('td', attrs={'class': 'copy'}) if td: for a in td.find('a'): a.extract() del td['class'] td['id'] = 'copyright' td.name = 'div' freshSoup.body.append(td) return freshSoup def getFreshSoup(self, oldSoup): freshSoup = BeautifulSoup( '