calibre/recipes/espn.recipe

#!/usr/bin/env  python2
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
espn.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import TemporaryFile


class ESPN(BasicNewsRecipe):
    """Recipe that downloads ESPN sports news from the site's RSS feeds.

    Article URLs are taken from each feed entry's ``guid`` and rewritten to
    ESPN's print pages by ``print_version``. When both a username and a
    password are configured, ``get_browser`` signs in before returning the
    browser; otherwise the default browser is used unchanged.
    """

    title = 'ESPN'
    description = 'Sports news'
    __author__ = 'Kovid Goyal and Sujata Raman'
    language = 'en'
    no_stylesheets = True

    use_embedded_content = False
    remove_javascript = True
    # Credentials are optional: get_browser() only logs in when both are set.
    needs_subscription = 'optional'
    encoding = 'ISO-8859-1'

    remove_tags_before = dict(name='font', attrs={'class': 'date'})
    center_navbar = False
    remove_tags = [
        dict(name='font', attrs={'class': 'footer'}),
        dict(name='hr', noshade='noshade'),
        dict(name='img', src='/winnercomm/horseracing/DRF.jpg'),
    ]

    extra_css = '''
                body{font-family:Verdana,Arial,Helvetica,sans-serif; font-size:x-small; font-weight:normal;}
                .subhead{color:#666666;font-family:Verdana,sans-serif; font-size:x-small; font-weight:bold;}
                .clearfix{font-family:Verdana,sans-serif; font-size:xx-small; }
                .date{ font-family:Verdana,Arial,Helvetica,sans-serif ; font-size:xx-small;color:#7A7A7A;}
                .byline{ font-family:Verdana,Arial,Helvetica,sans-serif ; font-size:xx-small;color:#666666;}
                .headline{font-family:Verdana,Arial,Helvetica,sans-serif ; font-size:large; font-weight:bold;}
                '''

    # Entries are either a (title, url) pair or a bare feed URL.
    feeds = [
        ('Top Headlines', 'http://sports.espn.go.com/espn/rss/news'),
        'http://sports.espn.go.com/espn/rss/nfl/news',
        'http://sports.espn.go.com/espn/rss/nba/news',
        'http://sports.espn.go.com/espn/rss/mlb/news',
        'http://sports.espn.go.com/espn/rss/nhl/news',
        'http://sports.espn.go.com/espn/rss/golf/news',
        'http://sports.espn.go.com/espn/rss/rpm/news',
        'http://sports.espn.go.com/espn/rss/tennis/news',
        'http://sports.espn.go.com/espn/rss/boxing/news',
        'http://soccernet.espn.go.com/rss/news',
        'http://sports.espn.go.com/espn/rss/ncb/news',
        'http://sports.espn.go.com/espn/rss/ncf/news',
        'http://sports.espn.go.com/espn/rss/ncaa/news',
        'http://sports.espn.go.com/espn/rss/outdoors/news',
        # 'http://sports.espn.go.com/espn/rss/bassmaster/news',
        'http://sports.espn.go.com/espn/rss/oly/news',
        'http://sports.espn.go.com/espn/rss/horse/news'
    ]

    def preprocess_html(self, soup):
        """Blank the inline style of every <div> whose style mentions 'px'.

        :param soup: parsed article document; modified in place.
        :return: the same ``soup`` object.
        """
        # style=True selects only divs that carry a style attribute, the same
        # filter postprocess_html uses; this replaces the deprecated has_key().
        for div in soup.findAll('div', style=True):
            if 'px' in div['style']:
                div['style'] = ''

        return soup

    def postprocess_html(self, soup, first_fetch):
        """Replace 'center' with 'left' in the inline style of every <div>.

        :param soup: parsed article document; modified in place.
        :param first_fetch: not used by this recipe.
        :return: the same ``soup`` object.
        """
        for div in soup.findAll('div', style=True):
            div['style'] = div['style'].replace('center', 'left')

        return soup

    def get_browser(self):
        """Return the recipe's browser, signed in to ESPN when possible.

        The login is attempted only if both ``self.username`` and
        ``self.password`` are truthy.

        :return: the browser object obtained from ``BasicNewsRecipe``.
        """
        br = BasicNewsRecipe.get_browser(self)
        if not (self.username and self.password):
            return br

        # Refresh handling is switched off for the duration of the login and
        # switched back on afterwards.
        br.set_handle_refresh(False)
        url = ('https://r.espn.go.com/members/v3_1/login')
        raw = br.open(url).read()
        # The page body is raw bytes (it is written back in binary mode
        # below), so the pattern must be bytes too; a text pattern raises
        # TypeError on Python 3. This strips the <form> block that contains
        # id="regsigninbtn" -- presumably so that the first remaining form
        # is the login form selected below.
        raw = re.sub(br'(?s)<form>.*?id="regsigninbtn".*?</form>', b'', raw)
        # Load the edited markup into the browser via a temporary local file.
        with TemporaryFile(suffix='.htm') as fname:
            with open(fname, 'wb') as f:
                f.write(raw)
            br.open_local_file(fname)

        br.form = list(br.forms())[0]
        br.form.find_control(
            name='username', type='text').value = self.username
        br.form['password'] = self.password
        br.submit().read()
        br.open('http://espn.go.com').read()
        br.set_handle_refresh(True)
        return br

    def get_article_url(self, article):
        """Return the feed entry's ``guid`` as the article URL.

        :param article: feed entry supporting ``.get``.
        :return: the ``guid`` value, or ``None`` if the entry has none.
        """
        return article.get('guid')

    def print_version(self, url):
        """Map an article URL to the URL of its print-friendly page.

        :param url: article URL as returned by ``get_article_url``.
        :return: the print URL, or ``None`` when the URL matches none of the
            known patterns (NOTE(review): presumably the framework then
            skips the article -- confirm against BasicNewsRecipe).
        """
        if 'eticket' in url:
            # Keep only the part before the first '&' and switch to 'print?'.
            return url.partition('&')[0].replace('story?', 'print?')

        if 'soccernet' in url:
            # Soccernet carries the story id in the path, not the query.
            match = re.search(r'/id/(\d+)/', url)
            if match is None:
                return None
            return 'http://soccernet.espn.go.com/print?id=%s&type=story' % (
                match.group(1))

        if 'bassmaster' in url:
            # No print mapping for bassmaster stories (its feed is disabled
            # in ``feeds``).
            return None

        match = re.search(r'story\?(id=\d+)', url)
        if match is None:
            return None
        return ('http://sports.espn.go.com/espn/print?' + match.group(1) +
                '&type=story')