mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-24 23:38:55 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			116 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			116 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| # -*- coding: utf-8 -*-
 | |
| 
 | |
| import time
 | |
| 
 | |
| from calibre.web.feeds.recipes import BasicNewsRecipe
 | |
| from calibre.ebooks.BeautifulSoup import BeautifulSoup
 | |
| 
 | |
| 
 | |
class JASN(BasicNewsRecipe):
    """Recipe for the Journal of the American Society of Nephrology.

    Signs in with the user's credentials, builds the feed list from the
    current-issue table of contents at ``INDEX``, and replaces each table
    that holds an '[in this window]' link with the image found on the
    linked page.
    """

    title = u'Journal of the American Society of Nephrology'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 31  # days
    max_articles_per_feed = 25
    delay = 5
    needs_subscription = True

    # Site root used to turn site-relative hrefs into absolute URLs.
    BASE_URL = 'http://jasn.asnjournals.org'
    INDEX = 'http://jasn.asnjournals.org/current.shtml'
    no_stylesheets = True
    remove_tags_before = dict(name='h2')
    remove_tags = [
        dict(name='iframe'),
        dict(name='td', attrs={'id': ['jasnFooter']}),
        dict(name='table', attrs={'id': "jasnNavBar"}),
        dict(name='table', attrs={'class': 'content_box_outer_table'}),
        dict(name='th', attrs={'align': 'left'})
    ]

    def _absolute_url(self, url):
        """Return *url* prefixed with ``BASE_URL`` when it starts with '/'."""
        if url.startswith('/'):
            return self.BASE_URL + url
        return url

    def _fetch_soup_with_retry(self, url):
        """Fetch *url* as a soup, retrying once after a 5 second pause.

        Returns None when both attempts fail, so that one unreachable page
        does not abort processing of the whole article.
        """
        try:
            return self.index_to_soup(url)
        except Exception:
            time.sleep(5)
        try:
            return self.index_to_soup(url)
        except Exception:
            self.log('Failed to fetch', url)
            return None

    # TO LOGIN
    def get_browser(self):
        """Return a browser that has signed in to the journal site.

        Opens the first '[Full Text]' link in the current table of contents,
        fills in the 'UserSignIn' form on the resulting page with the
        recipe's username and password, and submits it. The parsed table of
        contents is kept on ``self.kidney_toc_soup``.

        Raises:
            ValueError: if the table of contents or a '[Full Text]' link
                cannot be found, or if the response to the sign-in form
                does not contain 'Sign Out'.
        """
        br = BasicNewsRecipe.get_browser(self)
        self.kidney_toc_soup = BeautifulSoup(br.open(self.INDEX).read())
        toc = self.kidney_toc_soup.find(id='tocTable')
        if toc is None:
            raise ValueError('Could not find the table of contents (tocTable)')
        full_text = toc.find(text=lambda s: s and '[Full Text]' in s)
        link = None
        if full_text is not None:
            link = full_text.findParent('a', href=True)
        if link is None:
            raise ValueError(
                'Could not find a [Full Text] link in the table of contents')
        br.open(self._absolute_url(link.get('href')))
        br.select_form(name='UserSignIn')
        br['username'] = self.username
        br['code'] = self.password
        response = br.submit()
        raw = response.read()
        if b'Sign Out' not in raw:
            raise ValueError('Failed to log in, is your account expired?')
        return br

    # TO GET ARTICLE TOC
    def jasn_get_index(self):
        """Return the parsed current-issue table of contents page."""
        return self.index_to_soup(self.INDEX)

    # To parse article toc
    def parse_index(self):
        """Build the feed list from the current table of contents.

        Every ``h2`` inside the 'tocBody' element starts a new section.
        Every ``strong`` that follows is treated as an article title, and its
        URL is taken from the first link containing '/full/' under the
        title's grandparent element. Titles without such a link are skipped,
        and sections without any articles are dropped.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each
            article is a dict with 'title', 'url', 'description' and 'date'
            keys.

        Raises:
            ValueError: if the 'tocBody' element cannot be found.
        """
        toc_soup = self.jasn_get_index()

        body = toc_soup.find(id='tocBody')
        if body is None:
            raise ValueError('Could not find the table of contents (tocBody)')

        current_section = None
        current_articles = []
        feeds = []
        for tag in body.findAll(True):
            if tag.name == 'h2':
                # Section heading found: flush the previous section first.
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(tag)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif current_section is not None and tag.name == 'strong':
                title = self.tag_to_string(tag)
                a = tag.parent.parent.find(
                    'a', href=lambda href: href and '/full/' in href)
                if a is None:
                    continue
                url = a.get('href', False)
                if not url or not title:
                    continue
                url = self._absolute_url(url)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url': url,
                                         'description': '', 'date': ''})

        if current_articles and current_section:
            feeds.append((current_section, current_articles))

        return feeds

    def preprocess_html(self, soup):
        """Replace '[in this window]' link tables with the linked image.

        For each text node containing '[in this window]', the enclosing
        link's target page is fetched (with one retry). If that page has an
        ``img`` whose ``src`` starts with '/content/', the image is moved
        into *soup* in place of the table that encloses the link. Links that
        cannot be resolved or fetched are left untouched.

        Returns:
            The same *soup* object, modified in place.
        """
        markers = soup.findAll(text=lambda s: s and '[in this window]' in s)
        for marker in markers:
            a = marker.findParent('a')
            if a is None:
                continue
            url = a.get('href', None)
            if not url:
                continue
            # The page is fetched for every link, not only site-relative
            # ones, so `img` always belongs to the link being handled.
            isoup = self._fetch_soup_with_retry(self._absolute_url(url))
            if isoup is None:
                continue
            img = isoup.find(
                'img', src=lambda s: s and s.startswith('/content/'))
            if img is None:
                continue
            table = a.findParent('table')
            if table is None:
                continue
            img.extract()
            table.replaceWith(img)
        return soup