__license__ = 'GPL v3'
__copyright__ = '2008-2013, Darko Miletic '
'''
tomshardware.com/us
'''

import urllib

from calibre.web.feeds.recipes import BasicNewsRecipe

try:
    # Python 3 keeps urlencode in the urllib.parse submodule.
    from urllib.parse import urlencode
except ImportError:
    # Python 2 exposes it directly on the urllib module.
    urlencode = urllib.urlencode


class Tomshardware(BasicNewsRecipe):
    """Recipe that downloads Tom's Hardware US reviews and news feeds."""

    title = "Tom's Hardware US"
    __author__ = 'Darko Miletic'
    description = 'Hardware reviews and News'
    publisher = "Tom's Hardware"
    category = 'news, IT, hardware, USA'
    no_stylesheets = True
    needs_subscription = 'optional'
    language = 'en'
    # Site root; every other URL built by this recipe hangs off it.
    INDEX = 'http://www.tomshardware.com'
    # Endpoint that receives the login form POST in get_browser().
    LOGIN = INDEX + '/membres/'
    remove_javascript = True
    use_embedded_content = False

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    def get_browser(self):
        """Return the download browser, logged in when credentials are set.

        The US front page is always opened first. When both ``username``
        and ``password`` are set, the login form fields are POSTed to
        ``LOGIN`` before the browser is returned.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX + '/us/')
        if self.username is not None and self.password is not None:
            data = urlencode({
                'action': 'login_action',
                'r': self.INDEX + '/us/',
                'login': self.username,
                'mdp': self.password,
            })
            br.open(self.LOGIN, data)
        return br

    remove_tags = [
        dict(name='div', attrs={'id': 'header'}),
        dict(name='object'),
    ]

    feeds = [
        (u'Reviews', u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-2.xml'),
        (u'News', u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-1.xml'),
    ]

    def print_version(self, url):
        """Map an article URL to the matching print-view URL.

        The article id is the text between the last ``,`` and the last
        ``.html`` of ``url``. URLs whose path (before that comma) contains
        ``/reviews/`` use ``review_print.php``; all others use
        ``news_print.php``.
        """
        main = url.rpartition('.html')[0]
        rmain, _, article_id = main.rpartition(',')
        if '/reviews/' in rmain:
            rind = 'http://www.tomshardware.com/review_print.php?p1='
        else:
            rind = 'http://www.tomshardware.com/news_print.php?p1='
        return rind + article_id

    def cleanup_image_tags(self, soup):
        """Strip layout attributes from every ``<img>`` tag and return soup."""
        for item in soup.findAll('img'):
            # Prefer has_attr() when the tag object provides it and fall
            # back to the legacy has_key() spelling otherwise, so the
            # check works whichever BeautifulSoup generation is in use.
            has_attr = getattr(item, 'has_attr', None) or item.has_key
            for attrib in ('height', 'width', 'border', 'align'):
                if has_attr(attrib):
                    del item[attrib]
        return soup

    def preprocess_html(self, soup):
        """Normalise downloaded markup before conversion.

        Removes the body ``onload`` attribute, drops every inline ``style``
        attribute, renames ``<span>`` tags to ``<div>`` and finally cleans
        the image tags. Returns the modified soup.
        """
        body = soup.body
        # A document without a <body> yields None here; skip it instead
        # of failing on the attribute deletion.
        if body is not None:
            del body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        for it in soup.findAll('span'):
            it.name = "div"
        return self.cleanup_image_tags(soup)