import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class JournalofHospitalMedicine(BasicNewsRecipe):
    """Recipe that downloads the current issue of the Journal of Hospital Medicine.

    The recipe logs in to Wiley InterScience with the user's credentials,
    fetches the current-issue table of contents from Wiley Online Library,
    and builds one feed per TOC section.
    """

    title = 'Journal of Hospital Medicine'
    __author__ = 'Kovid Goyal'
    description = 'Medical news'
    timefmt = ' [%d %b, %Y]'
    # get_browser() below submits self.username / self.password.
    needs_subscription = True
    language = 'en'
    no_stylesheets = True
    # Element ids / classes used to select and strip article content.
    keep_only_tags = [dict(id=['articleTitle', 'articleMeta', 'fulltext'])]
    remove_tags = [dict(attrs={'class': 'licensedContent'})]

    def get_browser(self):
        """Return a browser that has been logged in to Wiley InterScience.

        Raises:
            Exception: if the response to the login form does not contain
                the ``LOGGED IN`` marker text.
        """
        # get_browser is an instance method, so self must be passed explicitly
        # when calling it through the base class.
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www3.interscience.wiley.com/cgi-bin/home')
        br.select_form(nr=0)
        br['j_username'] = self.username
        br['j_password'] = self.password
        response = br.submit()
        raw = response.read()
        # The response body may be bytes; decode so the substring test below
        # compares text with text.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        # NOTE(review): the marker in the original source was split across
        # several lines (any surrounding markup appears to have been lost),
        # so only the bare text is checked here -- confirm against the page.
        if 'LOGGED IN' not in raw:
            raise Exception('Login failed. Check your username and password')
        return br

    def johm_get_index(self):
        """Fetch the current-issue table-of-contents page as parsed soup."""
        return self.index_to_soup(
            'http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1553-5606/currentissue'
        )

    def parse_index(self):
        """Parse the issue TOC into a list of ``(section_title, articles)``.

        Each article is a dict with the keys ``title``, ``url``,
        ``description`` and ``date``. Sections without a heading, without
        an article container, or without any articles are skipped.
        """
        soup = self.johm_get_index()
        feeds = []
        toc = soup.find(id='issueTocGroups')
        if toc is None:
            # Without the TOC container there is nothing to parse; report it
            # instead of failing with an AttributeError on None.
            self.log('Could not find the issue table of contents (issueTocGroups)')
            return feeds

        for group in toc.findAll('li', id=re.compile(r'group\d+')):
            gtitle = group.find(attrs={'class': 'subSectionHeading'})
            if gtitle is None:
                continue
            gtitle = self.tag_to_string(gtitle)
            arts = group.find(attrs={'class': 'articles'})
            if arts is None:
                continue
            self.log('Found section:', gtitle)

            articles = []
            for art in arts.findAll(attrs={'class': lambda x: x and 'tocArticle' in x}):
                a = art.find('a', href=True)
                if a is None:
                    continue
                url = a.get('href')
                if url.startswith('/'):
                    url = 'http://onlinelibrary.wiley.com' + url
                # Link to the full text rather than the abstract page.
                url = url.replace('/abstract', '/full')
                title = self.tag_to_string(a)
                # Remove the title link and the product menu so that the
                # remaining text of the entry becomes the description.
                a.extract()
                pm = art.find(attrs={'class': 'productMenu'})
                if pm is not None:
                    pm.extract()
                desc = self.tag_to_string(art)
                self.log('\tFound article:', title, 'at', url)
                articles.append(
                    {'title': title, 'url': url, 'description': desc, 'date': ''}
                )

            if articles:
                feeds.append((gtitle, articles))

        return feeds