# calibre/recipes/barrons.recipe

from calibre.web.feeds.news import BasicNewsRecipe

class Barrons(BasicNewsRecipe):
    """Recipe that downloads Barron's articles from its RSS feeds.

    A subscription is required (``needs_subscription``); the login form is
    filled in by :meth:`javascript_login`.
    """

    title = 'Barron\'s'
    max_articles_per_feed = 50
    needs_subscription    = True
    language = 'en'

    __author__ = 'Kovid Goyal'
    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
    timefmt  = ' [%a, %b %d, %Y]'
    use_embedded_content   = False
    no_stylesheets = True
    # Raw string so that ``\?`` reaches the regex engine as an escaped
    # question mark without relying on Python passing unknown escapes through.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    conversion_options = {'linearize_tables': True}
    ##delay = 1

    # Don't grab articles more than 7 days old
    oldest_article = 7
    use_javascript_to_login = True
    requires_version = (0, 9, 16)

    # Keep only elements whose class attribute starts with one of the
    # "sector ... column" prefixes.
    keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})]
    remove_tags = [
        dict(name='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
        dict(attrs={'class':['insetButton', 'insettipBox', 'insetClose']}),
        dict(attrs={'data-module-name':['resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}),
        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
    ]

    def javascript_login(self, br, username, password):
        """Log in by submitting the first form on the Barron's login page.

        :param br: browser object used to visit the page and submit the form.
        :param username: value written to the form's ``username`` field.
        :param password: value written to the form's ``password`` field.
        """
        br.visit('http://commerce.barrons.com/auth/login')
        f = br.select_form(nr=0)
        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)

    # Use the print version of a page when available.
    def print_version(self, url):
        """Return ``url`` with its query string replaced by ``#text.print``.

        Everything from the last ``?`` onwards is dropped. A URL without a
        query string is kept whole (previously it was discarded and only the
        bare fragment was returned).
        """
        main, sep, _rest = url.rpartition('?')
        if not sep:
            # rpartition() puts the whole string in the last slot when the
            # separator is absent, leaving ``main`` empty.
            main = url
        return main + '#text.print'

    def preprocess_html(self, soup):
        """Strip the thumbnail image from zoomable-image boxes; return ``soup``."""
        # Remove thumbnail for zoomable images
        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()

        return soup

    # Comment out the feeds you don't want retrieved.
    # Because these feeds are sorted alphabetically when converted to LRF,
    # you may want to number them to put them in the order you desire.

    def get_feeds(self):
        """Return the list of ``(title, feed URL)`` pairs to download."""
        return [
            ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    def get_article_url(self, article):
        """Return the feed entry's ``link`` value, or ``None`` if it has none."""
        return article.get('link', None)

    def get_cover_url(self):
        """Return the cover image URL scraped from the home page, or ``None``.

        ``None`` is returned when the magazine list, its image, or the
        image's ``src`` attribute cannot be found, instead of raising.
        """
        index = 'http://online.barrons.com/home-page'
        soup = self.index_to_soup(index)
        link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
        if link_item is None:
            return None
        img = link_item.find('img')
        if img is None:
            # The list exists but carries no image; a missing cover should
            # not abort the whole download.
            return None
        return img.get('src')