# calibre/recipes/wapo_cartoons.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import date, timedelta


class WaPoCartoonsRecipe(BasicNewsRecipe):
    """Recipe that collects editorial cartoons listed by the Washington Post.

    Each entry in ``feeds`` is an archive page for one cartoonist.  The page
    itself is used as the "Current" cartoon, and older cartoons are taken
    from the date ``<select>`` element found on that page (named ``url`` on
    the uclick/wpcomics pages handled by ``cartoonCandidatesWaPo`` and
    ``dest`` on the pages handled by ``cartoonCandidatesCreatorsCom``).
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 2

    title = u'Washington Post Cartoons'
    publisher = u'Washington Post'
    category = u'News, Cartoons'
    description = u'Cartoons from the Washington Post'

    # Number of days back from today for which archived cartoons are kept.
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # (section title, archive page URL) pairs, one per cartoonist.
    feeds = [
        (u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'),
        (u'Auth', u'http://www.uclick.com/client/wpc/ta/'),
        (u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'),
        (u'Carlson', u'http://www.uclick.com/client/wpc/sc/'),
        (u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'),
        (u'McCoy', u'http://www.uclick.com/client/wpc/gm/'),
        (u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'),
        (u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'),
        (u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'),
    ]

    # The #name and #copyright selectors match the ids that
    # preprocess_html() gives to the elements it keeps.
    extra_css = '''
                body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
                h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
                #name {margin-bottom: 0.2em}
                #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
                '''

    def parse_index(self):
        """Build the article index: one section per entry in ``feeds``.

        Returns a list of ``[section title, articles]`` pairs.  Every section
        starts with a 'Current' article pointing at the feed URL itself,
        followed by the archived cartoons that are not older than
        ``oldest_article`` days.
        """
        index = []
        oldestDate = date.today() - timedelta(days=self.oldest_article)
        # Dates are compared as fixed-width 'YYYYMMDD' strings.
        oldest = oldestDate.strftime('%Y%m%d')
        for feedTitle, feedUrl in self.feeds:
            soup = self.index_to_soup(feedUrl)

            cartoons = [{'title': 'Current', 'date': None,
                         'url': feedUrl, 'description': ''}]

            # The name of the archive drop-down tells the two page layouts
            # apart.
            select = soup.find('select', attrs={'name': ['url', 'dest']})
            if select:
                if select['name'] == 'url':
                    candidates = self.cartoonCandidatesWaPo(select, oldest)
                else:
                    candidates = self.cartoonCandidatesCreatorsCom(
                        select, oldest)
                cartoons.extend(candidates)

            index.append([feedTitle, cartoons])

        return index

    def preprocess_html(self, soup):
        """Reduce a cartoon page to its name, image and copyright notice.

        The kept elements are moved into a new, otherwise empty document and
        normalised to ``div`` elements with the ids ``name``, ``comic_full``
        and ``copyright``.  Elements that cannot be found on the page are
        skipped instead of raising.

        Returns the new soup.
        """
        freshSoup = self.getFreshSoup(soup)

        div = soup.find('div', attrs={'id': 'name'})
        if div is not None:
            # Layout that already uses the div ids we want.
            freshSoup.body.append(div)

            comic = soup.find('div', attrs={'id': 'comic_full'})
            if comic is not None:
                img = comic.find('img')
                # Drop the last '&...' part of the image URL.
                if img is not None and '&' in img.get('src', ''):
                    img['src'] = img['src'].rpartition('&')[0]
                freshSoup.body.append(comic)

            copyrightDiv = soup.find('div', attrs={'id': 'copyright'})
            if copyrightDiv is not None:
                freshSoup.body.append(copyrightDiv)
        else:
            # Other layout: convert the span/td elements into the same
            # div structure as above.
            span = soup.find('span', attrs={'class': 'title'})
            if span is not None:
                del span['class']
                span['id'] = 'name'
                span.name = 'div'
                freshSoup.body.append(span)

            img = soup.find('img', attrs={'class': 'pic_big'})
            if img is not None:
                container = img.parent
                # Set before deleting so the attribute is guaranteed to
                # exist. NOTE(review): presumably guards against parser
                # versions where deleting a missing attribute fails - confirm.
                container['style'] = ''
                del container['style']
                container.name = 'div'
                container['id'] = 'comic_full'
                freshSoup.body.append(container)

            copyrightCell = soup.find('td', attrs={'class': 'copy'})
            if copyrightCell is not None:
                # findAll() (not find()) so that every link is removed and a
                # cell without links does not raise.
                for a in copyrightCell.findAll('a'):
                    a.extract()
                del copyrightCell['class']
                copyrightCell['id'] = 'copyright'
                copyrightCell.name = 'div'
                freshSoup.body.append(copyrightCell)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document carrying over ``oldSoup``'s title.

        The title is left empty when ``oldSoup`` has no ``<head>`` or no
        ``<title>``.
        """
        freshSoup = BeautifulSoup(
            '<html><head><title></title></head><body></body></html>')
        head = oldSoup.head
        if head is not None and head.title:
            freshSoup.head.title.append(self.tag_to_string(head.title))
        return freshSoup

    def cartoonCandidatesWaPo(self, select, oldest):
        """Yield article dicts for the options of a 'url' archive drop-down.

        select -- the ``<select>`` tag whose option values are cartoon URLs
                  ending in three date path segments.
        oldest -- cut-off date as a 'YYYYMMDD' string.

        The first option is skipped.  Iteration stops at the first option
        older than ``oldest`` (this assumes the options are listed newest
        first - TODO confirm against the live pages).
        """
        for opt in select.findAll('option')[1:]:
            url = opt['value'].rstrip('/')
            dateparts = url.split('/')[-3:]
            # Zero-pad month and day so the string comparison below stays
            # valid even if the URL uses single-digit values.
            datenum = ''.join(part.zfill(2) for part in dateparts)
            if datenum < oldest:
                return
            yield {'title': self.tag_to_string(opt), 'date': None,
                   'url': url, 'description': ''}

    def cartoonCandidatesCreatorsCom(self, select, oldest):
        """Yield article dicts for the options of a 'dest' archive drop-down.

        select -- the ``<select>`` tag; each option's text is a date that
                  ends in ', <year>' and whose second and third
                  space-separated words are the English month name and the
                  day of the month.
        oldest -- cut-off date as a 'YYYYMMDD' string.

        The first matching option is skipped.  Iteration stops at the first
        option older than ``oldest`` (this assumes the options are listed
        newest first - TODO confirm against the live pages).
        """
        monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
                      'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
                      'November': '11', 'December': '12'}

        for opt in select.findAll('option', selected=False)[1:]:
            dateString = self.tag_to_string(opt)
            rest, sep, year = dateString.rpartition(', ')
            parts = rest.split(' ')
            day = parts[2].rjust(2, '0')
            month = monthNames[parts[1]]
            datenum = str(year) + month + str(day)
            if datenum < oldest:
                return
            yield {'title': dateString, 'date': None,
                   'url': opt['value'], 'description': ''}