mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-30 18:22:25 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			54 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			54 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| from calibre.ebooks.BeautifulSoup import BeautifulSoup
 | |
| 
 | |
class SmithsonianMagazine(BasicNewsRecipe):
    """Calibre news recipe for Smithsonian Magazine.

    Articles are discovered through the FeedBurner feeds listed in
    ``feeds``.  Each downloaded page is reduced to its
    ``<div id="article-body">`` element by ``preprocess_html``.
    """

    title = u'Smithsonian Magazine'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 31  # days
    max_articles_per_feed = 50
    use_embedded_content = False
    #encoding = 'latin1'

    # Follow links one level deep, restricted to URLs ending in
    # "&page=2" .. "&page=9".
    # NOTE(review): presumably these are the continuation pages of
    # multi-page articles -- confirm against the live site.
    recursions = 1
    match_regexps = [r'&page=[2-9]$']

    remove_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after = dict(name='p', attrs={'id': 'articlePaginationWrapper'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': 'article_sidebar_border'}),
        dict(name='div', attrs={'id': ['article_sidebar_border',
                                       'most-popular_large',
                                       'most-popular-body_large']}),
        ##dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='ul', attrs={'class': 'cat-breadcrumb col three last'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        ('History and Archeology',
         'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
        ('People and Places',
         'http://feeds.feedburner.com/smithsonianmag/people-places'),
        ('Science and Nature',
         'http://feeds.feedburner.com/smithsonianmag/science-nature'),
        ('Arts and Culture',
         'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
        ('Travel',
         'http://feeds.feedburner.com/smithsonianmag/travel'),
    ]

    def preprocess_html(self, soup):
        """Reduce a downloaded page to just its article body.

        Finds the ``<div id="article-body">`` element in ``soup`` and places
        it as the only child of the ``<body>`` of a fresh, minimal HTML
        document, which is returned instead of the original page.

        If the page contains no such element, the original ``soup`` is
        returned unchanged instead of attempting to insert ``None`` into the
        new document.
        """
        story = soup.find(name='div', attrs={'id': 'article-body'})
        if story is None:
            # Nothing to extract (e.g. unexpected page layout): keep the page
            # as downloaded rather than failing on it.
            return soup
        ##td = heading.findParent(name='td')
        ##td.extract()
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup

    #def postprocess_html(self, soup, first):
        #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
        #if not first:
             #for div in soup.findAll(id='article-head'): div.extract()
        #return soup