diff --git a/resources/recipes/johm.recipe b/resources/recipes/johm.recipe index 6930f05a50..0f5625b806 100644 --- a/resources/recipes/johm.recipe +++ b/resources/recipes/johm.recipe @@ -1,78 +1,72 @@ -# -*- coding: utf-8 -*- - +import re from calibre.web.feeds.recipes import BasicNewsRecipe class JournalofHospitalMedicine(BasicNewsRecipe): title = 'Journal of Hospital Medicine' - __author__ = 'Krittika Goyal' + __author__ = 'Kovid Goyal' description = 'Medical news' timefmt = ' [%d %b, %Y]' needs_subscription = True language = 'en' no_stylesheets = True + keep_only_tags = [dict(id=['articleTitle', 'articleMeta', 'fulltext'])] + remove_tags = [dict(attrs={'class':'licensedContent'})] # TO LOGIN def get_browser(self): br = BasicNewsRecipe.get_browser() br.open('http://www3.interscience.wiley.com/cgi-bin/home') - br.select_form(name='siteLogin') - br['LoginName'] = self.username - br['Password'] = self.password + br.select_form(nr=0) + br['j_username'] = self.username + br['j_password'] = self.password response = br.submit() raw = response.read() - if 'userName = ""' in raw: + if '

LOGGED IN

' not in raw: raise Exception('Login failed. Check your username and password') return br #TO GET ARTICLE TOC def johm_get_index(self): - return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home') + return self.index_to_soup('http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1553-5606/currentissue') # To parse artice toc def parse_index(self): - parse_soup = self.johm_get_index() - - div = parse_soup.find(id='contentCell') - - current_section = None - current_articles = [] + soup = self.johm_get_index() + toc = soup.find(id='issueTocGroups') feeds = [] - for x in div.findAll(True): - if x.name == 'h4': - # Section heading found - if current_articles and current_section: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if current_section is not None and x.name == 'strong': - title = self.tag_to_string(x) - p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x) - if p is None: - continue - url = p.get('href', False) - if not url or not title: + for group in toc.findAll('li', id=re.compile(r'group\d+')): + gtitle = group.find(attrs={'class':'subSectionHeading'}) + if gtitle is None: + continue + gtitle = self.tag_to_string(gtitle) + arts = group.find(attrs={'class':'articles'}) + if arts is None: + continue + self.log('Found section:', gtitle) + articles = [] + for art in arts.findAll(attrs={'class':lambda x: x and 'tocArticle' + in x}): + a = art.find('a', href=True) + if a is None: continue + url = a.get('href') if url.startswith('/'): - url = 'http://www3.interscience.wiley.com'+url - url = url.replace('/HTMLSTART', '/main.html,ftx_abs') - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - #if url.startswith('/'): - #url = 'http://online.wsj.com'+url - current_articles.append({'title': title, 'url':url, - 'description':'', 'date':''}) - - if current_articles and current_section: - feeds.append((current_section, current_articles)) + url = 'http://onlinelibrary.wiley.com' + url + url = url.replace('/abstract', '/full') + title = self.tag_to_string(a) + a.extract() + pm = art.find(attrs={'class':'productMenu'}) + if pm is not None: + pm.extract() + desc = self.tag_to_string(art) + self.log('\tFound article:', title, 'at', url) + articles.append({'title':title, 'url':url, 'description':desc, + 'date':''}) + if articles: + feeds.append((gtitle, articles)) return feeds - def preprocess_html(self, soup): - for img in soup.findAll('img', src=True): - img['src'] = img['src'].replace('tfig', 'nfig') - return soup - diff --git a/resources/recipes/nejm.recipe b/resources/recipes/nejm.recipe index a6580a5232..bc12fbcedf 100644 --- a/resources/recipes/nejm.recipe +++ b/resources/recipes/nejm.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class NYTimes(BasicNewsRecipe): title = 'New England Journal of Medicine' - __author__ = 'Krittika Goyal' + __author__ = 'Kovid Goyal' description = 'Medical news' timefmt = ' [%d %b, %Y]' needs_subscription = True