mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
129 lines
4.3 KiB
Plaintext
# -*- coding: utf-8 -*-
|
|
|
|
import time
|
|
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
|
|
|
class JASN(BasicNewsRecipe):
    """Recipe for the Journal of the American Society of Nephrology.

    Requires a subscription: ``get_browser`` logs in by following the first
    [Full Text] link from the table of contents, which redirects to the
    sign-in form. The article list is scraped from the issue TOC page
    (there is no usable RSS feed for full-text access).
    """

    title = u'Journal of the American Society of Nephrology'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 31  # days
    max_articles_per_feed = 25
    delay = 5  # seconds between requests; the site throttles aggressive fetching
    needs_subscription = True

    # Current-issue table of contents; both login and article discovery start here.
    INDEX = 'http://jasn.asnjournals.org/current.shtml'
    no_stylesheets = True
    remove_tags_before = dict(name='h2')
    remove_tags = [
        dict(name='iframe'),
        dict(name='td', attrs={'id': ['jasnFooter']}),
        dict(name='table', attrs={'id': 'jasnNavBar'}),
        dict(name='table', attrs={'class': 'content_box_outer_table'}),
        dict(name='th', attrs={'align': 'left'}),
    ]

    def get_browser(self):
        """Return a logged-in browser.

        Navigates from the TOC to any full-text article, which bounces an
        unauthenticated browser to the ``UserSignIn`` form; fills it in and
        verifies the login by looking for a 'Sign Out' link in the response.

        :raises ValueError: if the post-login page has no 'Sign Out' link.
        """
        # FIX: the original called BasicNewsRecipe.get_browser() without
        # `self`, which fails (unbound call). Pass the instance explicitly.
        br = BasicNewsRecipe.get_browser(self)
        # Cache the TOC soup; fetched once here while we are navigating anyway.
        self.kidney_toc_soup = BeautifulSoup(br.open(self.INDEX).read())
        toc = self.kidney_toc_soup.find(id='tocTable')
        t = toc.find(text=lambda x: x and '[Full Text]' in x)
        a = t.findParent('a', href=True)
        url = a.get('href')
        if url.startswith('/'):
            url = 'http://jasn.asnjournals.org' + url
        br.open(url)  # unauthenticated request redirects to the sign-in form
        br.select_form(name='UserSignIn')
        br['username'] = self.username
        # NOTE: the site's password field is named 'code'.
        br['code'] = self.password
        response = br.submit()
        # FIX: response.read() returns bytes on Python 3; decode before the
        # substring test (which previously raised TypeError there).
        raw = response.read()
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        if 'Sign Out' not in raw:
            raise ValueError('Failed to log in, is your account expired?')
        return br

    def jasn_get_index(self):
        """Fetch and return the current-issue TOC as a soup."""
        return self.index_to_soup('http://jasn.asnjournals.org/current.shtml')

    def parse_index(self):
        """Build the feed list by walking the TOC.

        Sections are the <h2> headings inside #tocBody; each article title
        lives in a <strong> whose grandparent also holds the '/full/' link.

        :return: list of ``(section_title, [article_dict, ...])`` tuples.
        """
        parse_soup = self.jasn_get_index()
        div = parse_soup.find(id='tocBody')

        current_section = None
        current_articles = []
        feeds = []
        for x in div.findAll(True):
            if x.name == 'h2':
                # New section heading: flush the previous section first.
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                current_articles = []
                self.log('\tFound section:', current_section)
            if current_section is not None and x.name == 'strong':
                title = self.tag_to_string(x)
                # The full-text link sits two levels up from the title tag.
                a = x.parent.parent.find('a', href=lambda x: x and '/full/' in x)
                if a is None:
                    continue
                url = a.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://jasn.asnjournals.org' + url
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url': url,
                    'description': '', 'date': ''})

        # Flush the trailing section.
        if current_articles and current_section:
            feeds.append((current_section, current_articles))

        return feeds

    def preprocess_html(self, soup):
        """Inline popup figures.

        Articles link figures via '[in this window]' popups; fetch each popup
        page and replace the link's enclosing table with the actual <img>.
        Each fetch is retried once after a 5 s pause; failures are skipped
        (best effort — the article is still usable without the figure).
        """
        for a in soup.findAll(text=lambda x: x and '[in this window]' in x):
            a = a.findParent('a')
            url = a.get('href', None)
            if not url:
                continue
            if url.startswith('/'):
                url = 'http://jasn.asnjournals.org' + url
            img = isoup = None
            # FIX: narrowed the original bare `except:` clauses, which also
            # swallowed KeyboardInterrupt/SystemExit.
            try:
                isoup = self.index_to_soup(url)
            except Exception:
                time.sleep(5)
                try:
                    isoup = self.index_to_soup(url)
                except Exception:
                    continue
            img = isoup.find('img', src=lambda x: x and x.startswith('/content/'))

            if img is not None:
                img.extract()
                table = a.findParent('table')
                # FIX: guard against a missing enclosing table, which would
                # have crashed on table.replaceWith(img).
                if table is not None:
                    table.replaceWith(img)
        return soup