calibre/recipes/philly.recipe

#!/usr/bin/env  python
__license__   = 'GPL v3'
'''
philly.com/inquirer/
'''
from calibre.web.feeds.recipes import BasicNewsRecipe

class Philly(BasicNewsRecipe):
    """Recipe that builds an edition of the Philadelphia Inquirer from philly.com RSS feeds.

    Article pages are trimmed down to the story containers listed in
    ``keep_only_tags``; navigation, polls, related-link lists and ad holders
    listed in ``remove_tags`` are dropped. When an article page offers a
    "View All" link, the single-page variant of the story is downloaded
    instead (see ``get_article_url``).
    """

    title       = 'Philadelphia Inquirer'
    __author__  = 'RadikalDissent and Sujata Raman'
    language = 'en'
    description = 'Daily news from the Philadelphia Inquirer'
    # Drop the site's own stylesheets; styling comes from extra_css below.
    no_stylesheets        = True
    # The feeds are used only as an index; article bodies are fetched from the site.
    use_embedded_content  = False
    # Feed filtering limits; see calibre's BasicNewsRecipe API for the exact units.
    oldest_article = 1
    max_articles_per_feed = 25

    # Stylesheet applied to the downloaded articles.
    extra_css = '''
        h1{font-family:verdana,arial,helvetica,sans-serif; font-size: large;}
        h2{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
        .body-content{font-family:verdana,arial,helvetica,sans-serif; font-size: small;}
        .byline {font-size: small; color: #666666; font-style:italic; }
        .lastline {font-size: small; color: #666666; font-style:italic;}
        .contact {font-size: small; color: #666666;}
        .contact p {font-size: small; color: #666666;}
        #photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
        .photoCaption { font-family:verdana,arial,helvetica,sans-serif; font-size:x-small;}
        #photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
        .photoCredit{ font-family:verdana,arial,helvetica,sans-serif; font-size:x-small; color:#666666;}
        .article_timestamp{font-size:x-small; color:#666666;}
        a {font-family:verdana,arial,helvetica,sans-serif; font-size: x-small;}
                '''

    # Only these containers of an article page are kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'story-content'}),
        dict(name='div', attrs={'id': 'contentinside'}),
    ]

    # Page furniture removed from the kept containers.
    remove_tags = [
        dict(name='div', attrs={'class': ['linkssubhead', 'post_balloon', 'relatedlist', 'pollquestion', 'b_sq']}),
        dict(name='dl', attrs={'class': 'relatedlist'}),
        dict(name='div', attrs={'id': ['photoNav', 'sidebar_adholder']}),
        dict(name='a', attrs={'class': ['headlineonly', 'bl']}),
        dict(name='img', attrs={'class': 'img_noborder'}),
    ]

    # Disabled alternative that appended the suffix to every URL
    # unconditionally; get_article_url below adds it only when the page
    # actually offers a "View All" link.
    # def print_version(self, url):
    #     return url + '?viewAll=y'

    feeds = [
        ('Front Page', 'http://www.philly.com/inquirer_front_page.rss'),
        ('Business', 'http://www.philly.com/inq_business.rss'),
        # ('News', 'http://www.philly.com/inquirer/news/index.rss'),
        ('Nation', 'http://www.philly.com/inq_news_world_us.rss'),
        ('Local', 'http://www.philly.com/inquirer_local.rss'),
        ('Health', 'http://www.philly.com/inquirer_health_science.rss'),
        ('Education', 'http://www.philly.com/inquirer_education.rss'),
        ('Editorial and opinion', 'http://www.philly.com/inq_news_editorial.rss'),
        ('Sports', 'http://www.philly.com/inquirer_sports.rss'),
    ]

    def get_article_url(self, article):
        """Return the URL to download for a feed entry.

        The entry's page is fetched and searched for a "View All" text node.
        If one is present, the ``viewAll=y`` query parameter is added to the
        link so the whole story is served on one page.

        The lookup is best effort: if fetching or parsing the page fails, the
        failure is logged and the unmodified feed link is returned.

        :param article: feed entry exposing a ``link`` attribute.
        :return: the article URL, possibly extended with ``viewAll=y``.
        """
        ans = article.link

        try:
            self.log('Looking for full story link in', ans)
            soup = self.index_to_soup(ans)

            if soup.find(text="View All") is not None:
                # Extend an existing query string rather than starting a
                # second one, which would yield a malformed URL.
                separator = '&' if '?' in ans else '?'
                ans = ans + separator + 'viewAll=y'
                self.log('Found full story link', ans)
        except Exception as err:
            # Deliberately non-fatal: fall back to the plain feed link, but
            # leave a trace in the log. KeyboardInterrupt/SystemExit are no
            # longer swallowed.
            self.log('Could not check for full story link in', ans, repr(err))
        return ans

    def postprocess_html(self, soup, first):
        """Strip Q&A title containers and all ``<br>`` tags from a downloaded page.

        :param soup: parsed page; modified in place.
        :param first: not used by this recipe.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll(name='div', attrs={'class': "container_ate_qandatitle"}):
            tag.extract()
        for tag in soup.findAll(name='br'):
            tag.extract()

        return soup