mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-24 23:38:55 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			71 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			71 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env  python
 | |
| 
 | |
| from calibre.web.feeds.recipes import BasicNewsRecipe
 | |
| 
 | |
class PCLab(BasicNewsRecipe):
    """Calibre news recipe for articles from the PC Lab website (pclab.pl).

    Articles are listed by the RSS feed in ``feeds``. An article may span
    several pages; ``preprocess_html`` follows each page's 'next' link and
    appends the content of the following pages to the first page.
    """

    # Recipe settings read by the BasicNewsRecipe base class.
    cover_url             = 'http://pclab.pl/img/logo.png'
    title                 = u"PC Lab"
    __author__            = 'ravcio - rlelusz[at]gmail.com'
    description           = u"Articles from PC Lab website"
    language              = 'pl'
    oldest_article        = 30.0
    max_articles_per_feed = 100
    recursions            = 0
    encoding              = 'iso-8859-2'
    no_stylesheets        = True
    remove_javascript     = True
    use_embedded_content  = False

    # Tag filters (matched by tag name and attributes).
    keep_only_tags = [
        dict(name='div', attrs={'class': ['substance']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['chapters']}),
        dict(name='div', attrs={'id': ['script_bxad_slot_display_list_bxad_slot']}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': ['navigation']}),
    ]

    # Links to RSS feeds.
    feeds = [('PCLab', u'http://pclab.pl/xml/artykuly.xml')]

    def append_page(self, soup, appendtag):
        """Append the content of every following page of an article.

        Starting from ``soup``, repeatedly look for the 'next' button, fetch
        the page it links to, and move that page's ``div.data`` (found inside
        ``div.substance``) to the end of ``appendtag``.

        soup      -- parsed page in which to look for the 'next' button
        appendtag -- tag to which the content of the following pages is added

        Stops quietly when there is no 'next' link, when a fetched page lacks
        the expected content container, or when a link points to a page that
        was already fetched during this call.
        """
        # Local import so the block does not depend on file-level imports;
        # the fallback keeps the recipe usable on Python 2.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        visited = set()  # URLs fetched so far; guards against cyclic links
        while True:
            # Find the 'Next' button and the link inside it.
            pager = soup.find('div', attrs={'class': 'next'})
            if pager is None:
                break
            link = pager.find('a')
            if link is None:
                break
            href = link.get('href')
            if not href:
                break

            # urljoin copes with relative, root-relative and absolute hrefs,
            # where plain concatenation would produce a malformed URL.
            nexturl = urljoin('http://pclab.pl/', href)
            if nexturl in visited:
                break
            visited.add(nexturl)

            soup = self.index_to_soup(nexturl)

            substance = soup.find('div', attrs={'class': 'substance'})
            if substance is None:
                break
            pagetext = substance.find('div', attrs={'class': 'data'})
            if pagetext is None:
                break

            # Detach the content from the fetched page and append it.
            pagetext.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

    def preprocess_html(self, soup):
        """Merge all pages of the article into ``soup`` and drop unwanted divs.

        soup -- parsed first page of the article

        Returns the same ``soup`` object after modification.
        """
        # The 'next' button is searched for in the whole soup, while the
        # content of the following pages is appended to soup.body.
        body = soup.body
        if body is not None:
            self.append_page(soup, body)

        # Finally remove some tags.
        unwanted = soup.findAll('div', attrs={'class': [
            'tags', 'index', 'script_bxad_slot_display_list_bxad_slot',
            'index first', 'zumi', 'navigation']})
        for tag in unwanted:
            tag.extract()

        return soup
 |