# -*- coding: utf-8 -*-
from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    """Build keyword arguments for a soup ``find`` call that match CSS classes.

    :param classes: Space-separated class names; an element matches when its
        ``class`` attribute shares at least one name with this set.
    :return: A dict of the form ``{'attrs': {'class': <predicate>}}`` intended
        to be splatted into ``find``/``findAll`` (``soup.find(**classes('a b'))``).
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        # The predicate receives the raw ``class`` attribute value; a missing
        # or empty attribute short-circuits to a falsy result.
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class NEJM(BasicNewsRecipe):
    """Recipe that downloads the New England Journal of Medicine issue TOC."""

    title = 'New England Journal of Medicine'
    __author__ = 'Kovid Goyal'
    description = 'Medical news'
    timefmt = ' [%d %b, %Y]'
    # The site requires a login; credentials are read from
    # ``self.username`` / ``self.password`` in ``get_browser``.
    needs_subscription = True
    language = 'en'
    no_stylesheets = True
    keep_only_tags = [
        dict(id='content')
    ]
    remove_tags_after = dict(id='article_references')
    remove_attributes = ['width', 'height']

    # Log in to the site
    def get_browser(self):
        """Return a browser that has been logged in to nejm.org.

        :return: The browser object obtained from ``BasicNewsRecipe.get_browser``
            after submitting the ``frmLogin`` form.
        :raises Exception: If the page returned after submitting the login
            form does not contain a "Sign Out" link.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('https://www.nejm.org/action/showLogin?uri=http%3A%2F%2Fwww.nejm.org%2F')
        br.select_form(name='frmLogin')
        br['login'] = self.username
        br['password'] = self.password
        response = br.submit()
        raw = response.read()
        # The response body may be bytes; testing a text substring against
        # bytes raises TypeError on Python 3, so normalise to text first.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', 'replace')
        if '>Sign Out<' not in raw:
            raise Exception('Login failed. Check your username and password')
        return br

    # Fetch the article table of contents
    def nejm_get_index(self):
        """Fetch and parse the table-of-contents page for the journal."""
        return self.index_to_soup('https://www.nejm.org/toc/nejm/medical-journal')

    # Parse the article table of contents
    def parse_index(self):
        """Build the list of feeds from the table-of-contents page.

        Walks the ``h2`` and ``li`` elements inside the ``pagefulltext``
        container in document order: each ``h2`` starts a new section and each
        ``li`` containing a link contributes one article to the current
        section.

        :return: A list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``title``, ``url`` and ``description`` keys.
            Sections without a title or without articles are omitted.
        :raises ValueError: If the ``pagefulltext`` container is not found.
        """
        soup = self.nejm_get_index()
        feeds = []
        current_section = None
        articles = []
        div = soup.find(**classes('pagefulltext'))
        if div is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'Could not find the table of contents (no element with class'
                ' "pagefulltext"); the page layout may have changed')
        for x in div.findAll(name=['h2', 'li']):
            if x.name == 'h2':
                # A new heading closes the previous section, if it had content.
                if current_section and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).strip()
                articles = []
                if current_section:
                    self.log(current_section)
            else:
                a = x.find('a')
                if a is None:
                    # List items without a link are not articles.
                    continue
                title = self.tag_to_string(a.find('strong')).strip()
                blurb = a.find(**classes('f-blurb'))
                desc = ''
                if blurb is not None:
                    desc = self.tag_to_string(blurb)
                self.log('\t', title)
                articles.append({
                    'title': title,
                    # href is joined to the site root as-is.
                    'url': 'https://www.nejm.org' + a['href'],
                    'description': desc,
                })
        # Flush the final section, which has no following heading to close it.
        if current_section and articles:
            feeds.append((current_section, articles))
        return feeds