mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-24 23:38:55 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			129 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			129 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| __license__   = 'GPL v3'
 | |
| __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 | |
| '''
 | |
| arstechnica.com
 | |
| '''
 | |
| 
 | |
| import re
 | |
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 | |
| 
 | |
class ArsTechnica(BasicNewsRecipe):
    """Calibre news recipe for arstechnica.com.

    Articles are taken from the section feeds listed in ``feeds``.  Only the
    ``story`` / ``etc-story`` container of each page is kept, inline styles
    and ids are stripped, and multi-page stories are stitched into a single
    document by following the "Next" link of the pager.
    """

    title                 = u'Ars Technica'
    language              = 'en'
    __author__            = 'Darko Miletic, Sujata Raman, Alexis Rohou'
    description           = 'The art of technology'
    publisher             = 'Ars Technica'
    category              = 'news, IT, technology'
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    extra_css             = '''
				body {font-family: Arial,Helvetica,sans-serif}
				.title{text-align: left}
				.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
				.news-item-figure-caption-text{font-size:small; font-style:italic}
				.news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
				'''
    # "Etc" feed items can be ignored, as they're not real stories.
    # Read by get_article_url().
    ignoreEtcArticles     = True

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Disabled pre-processing rules, kept for reference:
    # preprocess_regexps = [
    #     (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
    #    ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
    # ]

    keep_only_tags = [dict(name='div', attrs={'id': ['story', 'etc-story']})]

    remove_tags = [
        dict(name=['object', 'link', 'embed']),
        dict(name='div', attrs={'class': 'read-more-link'}),
    ]
    # remove_attributes=['width','height']

    feeds = [
        (u'Infinite Loop (Apple content)', u'http://feeds.arstechnica.com/arstechnica/apple/'),
        (u'Opposable Thumbs (Gaming content)', u'http://feeds.arstechnica.com/arstechnica/gaming/'),
        (u'Gear and Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets/'),
        (u'Chipster (Hardware content)', u'http://feeds.arstechnica.com/arstechnica/hardware/'),
        (u'Uptime (IT content)', u'http://feeds.arstechnica.com/arstechnica/business/'),
        (u'Open Ended (Open Source content)', u'http://feeds.arstechnica.com/arstechnica/open-source/'),
        (u'One Microsoft Way', u'http://feeds.arstechnica.com/arstechnica/microsoft/'),
        (u'Nobel Intent (Science content)', u'http://feeds.arstechnica.com/arstechnica/science/'),
        (u'Law & Disorder (Tech policy content)', u'http://feeds.arstechnica.com/arstechnica/tech-policy/'),
    ]

    def append_page(self, soup, appendtag, position):
        """Pull the following pages of a multi-page story into ``appendtag``.

        Looks for a ``div.pager`` in ``soup``; for every link in it whose text
        starts with ``Next`` the linked page is downloaded, its ``div.body``
        is cleaned of inline styles, the method recurses so that pages after
        that one are appended to that body first, and finally the body is
        inserted into ``appendtag`` at index ``position``.  The pager of the
        current page is detached once a next page has been merged.

        :param soup: parsed document of the current page.
        :param appendtag: tag that receives the next page's body.
        :param position: child index in ``appendtag`` at which to insert it.
        """
        pager = soup.find('div', attrs={'class': 'pager'})
        if pager is None:
            return

        for atag in pager.findAll('a', href=True):
            link_text = self.tag_to_string(atag)
            if not link_text.startswith('Next'):
                continue

            # Pager hrefs are treated as site-relative paths.
            nurl = 'http://arstechnica.com' + atag['href']
            rawc = self.index_to_soup(nurl, True)
            soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)

            readmoretag = soup2.find('div', attrs={'class': 'read-more-link'})
            if readmoretag is not None:
                readmoretag.extract()

            texttag = soup2.find('div', attrs={'class': 'body'})
            if texttag is None:
                # The fetched page has no article body (layout change or
                # error page); skip it instead of raising AttributeError.
                continue

            for styled in texttag.findAll(style=True):
                del styled['style']

            # Recurse first so later pages end up at the end of this body.
            newpos = len(texttag.contents)
            self.append_page(soup2, texttag, newpos)
            texttag.extract()
            pager.extract()
            appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean up a downloaded article page and merge its follow-up pages.

        Inserts two ``<br>`` tags into the byline, removes every ``style``
        and ``id`` attribute, makes the first ``/author...`` link absolute,
        and appends the remaining pages of a multi-page story to the body.

        :param soup: parsed article page; modified in place.
        :returns: the same ``soup`` object.
        """
        # Adds line breaks near the byline (not sure why this is needed)
        ftag = soup.find('div', attrs={'class': 'byline'})
        if ftag is not None:
            brtag = Tag(soup, 'br')
            brtag2 = Tag(soup, 'br')
            ftag.insert(4, brtag)
            ftag.insert(5, brtag2)

        # Remove style items
        for item in soup.findAll(style=True):
            del item['style']

        # Remove id
        for item in soup.findAll(id=True):
            del item['id']

        # For some reason, links to authors don't have the domainname
        a_author = soup.find('a', {'href': re.compile("^/author")})
        if a_author is not None:
            a_author['href'] = 'http://arstechnica.com' + a_author['href']

        # TODO: within div class news-item-figure, we need to grab images

        # Deal with multi-page stories
        self.append_page(soup, soup.body, 3)

        return soup

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or ``None`` to skip it.

        When ``ignoreEtcArticles`` is set, entries whose title starts with
        ``Etc: `` are skipped.  Otherwise the URL is the entry's ``guid`` with
        everything from the last ``?`` onwards removed.

        :param article: feed entry exposing ``get('title')`` / ``get('guid')``.
        :returns: the article URL, or ``None`` if the entry is an "Etc" item
            or carries no guid.
        """
        # If the article title starts with Etc:, don't return it
        if self.ignoreEtcArticles:
            article_title = article.get('title', None)
            # A missing title cannot be an "Etc" item; do not fail on it.
            if article_title and article_title.startswith('Etc: '):
                return None

        # The actual article is in a guid tag
        guid = article.get('guid', None)
        if not guid:
            return None

        # rpartition() yields ('', '', guid) when there is no '?', so index 0
        # alone would turn a query-less guid into an empty URL.
        base, sep, _ = guid.rpartition('?')
        return base if sep else guid
 | |
| 
 |