From 3e96fde80b7d5cb99cc19f8d37cf168809aac757 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 10 Jan 2010 09:45:43 -0700
Subject: [PATCH] New recipe for Washington Post cartoons by kwetal

---
 Review note: the recipe below was amended after the original commit, so the
 abbreviated blob id on its "index" line no longer matches the content.

 resources/images/news/wapo_cartoons.png | Bin 0 -> 311 bytes
 resources/recipes/wapo_cartoons.recipe  | 201 ++++++++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 resources/images/news/wapo_cartoons.png
 create mode 100644 resources/recipes/wapo_cartoons.recipe

diff --git a/resources/images/news/wapo_cartoons.png b/resources/images/news/wapo_cartoons.png
new file mode 100644
index 0000000000000000000000000000000000000000..5722cf5416dd40def923fed57d0d29e39e000a02
GIT binary patch
literal 311
zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`jKx9jP7LeL$-D$|*pj^6T^Rm@
z;DWu&Cj&(|3p^r=85p>QL70(Y)*K0-AbW|YuPgg4MlK#!gL;X?4M3soo-U3d7N?g^
z_7*y#z~OT7)ZSUM7n&cQY^g2ea>nQC2CrRDK5o=4KIf@;Ok&BZ?qg?I6y%l)G1~3i
zTK4*zu2Qx`WL4BKY!TEt#+d*AAuBdiPxdwE0eHkcjSbi}+P
z^WMF*x0Cj^PBynWpcDJw(@*}gdG_&CImZ3R*d<#V92gXwR{_1i;OXk;vd$@?2>?$z
Ba_Im7

literal 0
HcmV?d00001

diff --git a/resources/recipes/wapo_cartoons.recipe b/resources/recipes/wapo_cartoons.recipe
new file mode 100644
index 0000000000..78440aa140
--- /dev/null
+++ b/resources/recipes/wapo_cartoons.recipe
@@ -0,0 +1,201 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from datetime import date, timedelta
+
+class WaPoCartoonsRecipe(BasicNewsRecipe):
+    '''
+    Fetch the editorial cartoons that the Washington Post publishes.
+
+    parse_index() lists, for every cartoonist in feeds, the cartoon shown on
+    the feed page plus the archived cartoons of the last oldest_article days;
+    preprocess_html() strips each page down to name, cartoon and copyright.
+    '''
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en'
+    version = 2
+
+    title = u'Washington Post Cartoons'
+    publisher = u'Washington Post'
+    category = u'News, Cartoons'
+    description = u'Cartoons from the Washington Post'
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+    use_embedded_content = False
+    no_stylesheets = True
+
+    # (cartoonist, URL of the page showing that cartoonist's current cartoon)
+    feeds = [
+        (u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'),
+        (u'Auth', u'http://www.uclick.com/client/wpc/ta/'),
+        (u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'),
+        (u'Carlson', u'http://www.uclick.com/client/wpc/sc/'),
+        (u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'),
+        (u'McCoy', u'http://www.uclick.com/client/wpc/gm/'),
+        (u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'),
+        (u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'),
+        (u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'),
+    ]
+
+    # The ids below are the ones preprocess_html() leaves in the document.
+    extra_css = '''
+        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
+        h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
+        #name {margin-bottom: 0.2em}
+        #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
+        '''
+
+    def parse_index(self):
+        '''
+        Build the index: one section per entry in feeds.
+
+        Every section starts with the feed page itself (titled 'Current'),
+        followed by the archived cartoons found in the page's <select>
+        element, if there is one.
+
+        Returns a list of [section title, list of article dicts].
+        '''
+        index = []
+        oldestDate = date.today() - timedelta(days=self.oldest_article)
+        # Dates are compared as 'YYYYMMDD' strings, which order chronologically.
+        oldest = oldestDate.strftime('%Y%m%d')
+
+        for name, url in self.feeds:
+            soup = self.index_to_soup(url)
+            cartoons = [{'title': 'Current', 'date': None, 'url': url, 'description': ''}]
+
+            select = soup.find('select', attrs={'name': ['url', 'dest']})
+            if select:
+                # The name of the <select> decides which parser applies.
+                if select['name'] == 'url':
+                    candidates = self.cartoonCandidatesWaPo(select, oldest)
+                else:
+                    candidates = self.cartoonCandidatesCreatorsCom(select, oldest)
+                cartoons.extend(candidates)
+
+            index.append([name, cartoons])
+
+        return index
+
+    def preprocess_html(self, soup):
+        '''
+        Return a new document holding only name, cartoon and copyright.
+
+        Pages with a div#name are expected to carry div#comic_full and
+        div#copyright as well. On all other pages the equivalent elements
+        (span.title, the parent of img.pic_big, td.copy) are turned into divs
+        with those same ids, so that extra_css applies to both layouts.
+        Elements that are missing from the page are simply left out.
+        '''
+        freshSoup = self.getFreshSoup(soup)
+
+        div = soup.find('div', attrs={'id': 'name'})
+        if div:
+            freshSoup.body.append(div)
+
+            comic = soup.find('div', attrs={'id': 'comic_full'})
+            if comic is not None:
+                img = comic.find('img')
+                src = img.get('src', '') if img is not None else ''
+                if '&' in src:
+                    # Cut the image URL at its last '&'.
+                    img['src'] = src.rpartition('&')[0]
+                freshSoup.body.append(comic)
+
+            notice = soup.find('div', attrs={'id': 'copyright'})
+            if notice is not None:
+                freshSoup.body.append(notice)
+        else:
+            span = soup.find('span', attrs={'class': 'title'})
+            if span:
+                del span['class']
+                span['id'] = 'name'
+                span.name = 'div'
+                freshSoup.body.append(span)
+
+            img = soup.find('img', attrs={'class': 'pic_big'})
+            if img:
+                td = img.parent
+                if td.has_key('style'):
+                    del td['style']
+                td.name = 'div'
+                td['id'] = 'comic_full'
+                freshSoup.body.append(td)
+
+            td = soup.find('td', attrs={'class': 'copy'})
+            if td:
+                # findAll, not find: find() returns one tag (or None), and
+                # looping over a tag walks its children, not the links.
+                for a in td.findAll('a'):
+                    a.extract()
+                del td['class']
+                td['id'] = 'copyright'
+                td.name = 'div'
+                freshSoup.body.append(td)
+
+        return freshSoup
+
+    def getFreshSoup(self, oldSoup):
+        '''
+        Return an otherwise empty document with the title of oldSoup.
+        '''
+        # An empty string would give a soup without <head> and <body>, and
+        # both are dereferenced here and in preprocess_html().
+        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
+        head = oldSoup.head
+        if head is not None and head.title is not None:
+            freshSoup.head.title.append(self.tag_to_string(head.title))
+        return freshSoup
+
+    def cartoonCandidatesWaPo(self, select, oldest):
+        '''
+        Yield article dicts for the options of a <select name="url">.
+
+        The value of an option is a URL whose last three path segments are
+        taken to be year, month and day. The first option is skipped, as are
+        options without a numeric date. Iteration stops at the first cartoon
+        dated before oldest ('YYYYMMDD'), which assumes that the options
+        are listed newest first.
+        '''
+        for opt in select.findAll('option')[1:]:
+            url = opt['value'].rstrip('/')
+            parts = url.split('/')[-3:]
+            if len(parts) != 3 or not all(p.isdigit() for p in parts):
+                continue
+
+            # Pad month and day so that the string comparison stays valid.
+            datenum = parts[0] + parts[1].zfill(2) + parts[2].zfill(2)
+            if datenum < oldest:
+                return
+            yield {'title': self.tag_to_string(opt), 'date': None, 'url': url, 'description': ''}
+
+    def cartoonCandidatesCreatorsCom(self, select, oldest):
+        '''
+        Yield article dicts for the options of a <select name="dest">.
+
+        The text of an option must look like 'Xxx, January 9, 2010': the year
+        follows the last ', ' and month name and day are the second and third
+        word before it. The first option, the selected option and options
+        that do not match are skipped. Iteration stops at the first cartoon
+        dated before oldest ('YYYYMMDD'), which assumes that the options
+        are listed newest first.
+        '''
+        monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
+                      'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
+                      'November': '11', 'December': '12'}
+
+        for opt in select.findAll('option')[1:]:
+            if opt.has_key('selected'):
+                continue
+
+            dateString = self.tag_to_string(opt)
+            rest, sep, year = dateString.rpartition(', ')
+            parts = rest.split(' ')
+            if len(parts) < 3 or parts[1] not in monthNames:
+                continue
+
+            datenum = year + monthNames[parts[1]] + parts[2].rjust(2, '0')
+            if datenum < oldest:
+                return
+            yield {'title': dateString, 'date': None, 'url': opt['value'], 'description': ''}