# calibre/recipes/kidney.recipe

# -*- coding: utf-8 -*-

import time

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class JASN(BasicNewsRecipe):
    """Recipe for the Journal of the American Society of Nephrology.

    Signs in through the form reached from a full-text link of the current
    issue, builds one feed per ``h2`` section heading of the table of
    contents, and replaces '[in this window]' link tables in article pages
    with the image found on the linked page.
    """

    title = u'Journal of the American Society of Nephrology'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 31  # days
    max_articles_per_feed = 25
    delay = 5
    needs_subscription = True

    # Site root, prepended to every site-relative href found in the pages.
    BASE_URL = 'http://jasn.asnjournals.org'
    INDEX = BASE_URL + '/current.shtml'
    no_stylesheets = True
    remove_tags_before = dict(name='h2')
    remove_tags = [
        dict(name='iframe'),
        dict(name='td', attrs={'id': ['jasnFooter']}),
        dict(name='table', attrs={'id': "jasnNavBar"}),
        dict(name='table', attrs={'class': 'content_box_outer_table'}),
        dict(name='th', attrs={'align': 'left'})
    ]

    def _absolute_url(self, url):
        """Return ``url`` with the site root prepended if it starts with '/'."""
        if url.startswith('/'):
            return self.BASE_URL + url
        return url

    def _soup_with_retry(self, url):
        """Fetch ``url`` as soup, retrying once after 5 seconds.

        Returns None when both attempts fail, so that one unreachable page
        does not abort processing of the whole article.
        """
        try:
            return self.index_to_soup(url)
        except Exception:
            # Best effort: pause briefly, then give the page one more chance.
            time.sleep(5)
        try:
            return self.index_to_soup(url)
        except Exception:
            self.log('\tFailed to fetch', url)
            return None

    # TO LOGIN
    def get_browser(self):
        """Return a browser that has signed in with the user's credentials.

        The table of contents is downloaded (and kept on
        ``self.kidney_toc_soup``), the first '[Full Text]' link in it is
        opened, and the 'UserSignIn' form on the resulting page is submitted.

        Raises:
            ValueError: if no full-text link can be located, or if the page
                returned after submitting the form lacks 'Sign Out'.
        """
        br = BasicNewsRecipe.get_browser(self)
        self.kidney_toc_soup = BeautifulSoup(br.open(self.INDEX).read())

        toc = self.kidney_toc_soup.find(id='tocTable')
        label = None
        if toc is not None:
            label = toc.find(text=lambda s: s and '[Full&nbsp;Text]' in s)
        link = None
        if label is not None:
            link = label.findParent('a', href=True)
        if link is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError(
                'Could not find a full text link in the table of contents')

        br.open(self._absolute_url(link.get('href')))
        br.select_form(name='UserSignIn')
        br['username'] = self.username
        br['code'] = self.password
        raw = br.submit().read()
        if b'Sign Out' not in raw:
            raise ValueError('Failed to log in, is your account expired?')
        return br

    # TO GET ARTICLE TOC
    def jasn_get_index(self):
        """Return the soup of the current issue's table of contents."""
        return self.index_to_soup(self.INDEX)

    # To parse article toc
    def parse_index(self):
        """Build the list of ``(section title, articles)`` feeds.

        Every ``h2`` inside the 'tocBody' element starts a new section; every
        ``strong`` that follows is an article title whose link is the first
        anchor with '/full/' in its href under the title's grandparent.
        Sections without articles are dropped.

        Raises:
            ValueError: if the 'tocBody' element is missing.
        """
        toc_soup = self.jasn_get_index()

        div = toc_soup.find(id='tocBody')
        if div is None:
            raise ValueError('Could not find the table of contents body')

        current_section = None
        current_articles = []
        feeds = []
        for tag in div.findAll(True):
            if tag.name == 'h2':
                # Section heading found: flush the section collected so far.
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(tag)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif current_section is not None and tag.name == 'strong':
                title = self.tag_to_string(tag)
                a = tag.parent.parent.find(
                    'a', href=lambda h: h and '/full/' in h)
                if a is None:
                    continue
                url = a.get('href', False)
                if not url or not title:
                    continue
                url = self._absolute_url(url)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url': url,
                                         'description': '', 'date': ''})

        # Flush the last section, which has no following heading.
        if current_articles and current_section:
            feeds.append((current_section, current_articles))

        return feeds

    def preprocess_html(self, soup):
        """Replace '[in this window]' link tables with the linked image.

        For every such link with a site-relative href, the linked page is
        fetched and its first ``img`` whose src starts with '/content/' takes
        the place of the table that contains the link. Links that are not
        inside an anchor or a table, that are not site-relative, or whose
        page cannot be fetched are left untouched.
        """
        for text in soup.findAll(
                text=lambda s: s and '[in this window]' in s):
            a = text.findParent('a')
            if a is None:
                continue
            url = a.get('href', None)
            # Only site-relative links are followed. Skipping the rest here
            # keeps a stale image from an earlier link from being reused.
            if not url or not url.startswith('/'):
                continue

            isoup = self._soup_with_retry(self._absolute_url(url))
            if isoup is None:
                continue
            img = isoup.find(
                'img', src=lambda s: s and s.startswith('/content/'))
            table = a.findParent('table')
            if img is None or table is None:
                continue
            img.extract()
            table.replaceWith(img)
        return soup