#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic '
'''
tomshardware.com/us
'''

import urllib

try:
    # Python 3 keeps urlencode in urllib.parse.
    from urllib.parse import urlencode
except ImportError:
    # Python 2 exposes it directly on the urllib module.
    urlencode = urllib.urlencode

from calibre.web.feeds.recipes import BasicNewsRecipe


class Tomshardware(BasicNewsRecipe):
    """Recipe that fetches the Tom's Hardware US Atom feeds.

    Articles are rewritten to the site's print pages (see ``print_version``)
    and the downloaded HTML is stripped of inline styles, ``span`` tags and
    image layout attributes (see ``preprocess_html``).
    """

    title = "Tom's Hardware US"
    __author__ = 'Darko Miletic'
    description = 'Hardware reviews and News'
    publisher = "Tom's Hardware"
    category = 'news, IT, hardware, USA'
    no_stylesheets = True
    needs_subscription = True
    language = 'en'

    # Site root, and the endpoint the login form is posted to.
    INDEX = 'http://www.tomshardware.com'
    LOGIN = INDEX + '/membres/'

    remove_javascript = True
    use_embedded_content = False

    # Conversion metadata built from the attributes above.
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    def get_browser(self):
        """Return a browser that has opened the US index and tried to log in.

        The login form is only posted when both ``self.username`` and
        ``self.password`` are set.
        """
        # The base implementation is invoked on this instance; calling it
        # unbound (without ``self``) fails when it is an instance method.
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX + '/us/')
        if self.username is not None and self.password is not None:
            data = urlencode({
                'action': 'login_action',
                'r': self.INDEX + '/us/',
                'login': self.username,
                'mdp': self.password,
            })
            br.open(self.LOGIN, data)
        return br

    remove_tags = [
        dict(name='div', attrs={'id': 'header'}),
        dict(name='object'),
    ]

    feeds = [
        (u'Latest Articles', u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-2.xml'),
        (u'Latest News', u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-1.xml'),
    ]

    def print_version(self, url):
        """Map an article URL to the matching print-page URL.

        The article id is the text between the last ``,`` and the last
        ``.html`` in ``url``. URLs whose path (before that comma) contains
        ``/reviews/`` go to ``review_print.php``; all others go to
        ``news_print.php``.
        """
        stem = url.rpartition('.html')[0]
        path, _, article_id = stem.rpartition(',')
        if '/reviews/' in path:
            prefix = 'http://www.tomshardware.com/review_print.php?p1='
        else:
            prefix = 'http://www.tomshardware.com/news_print.php?p1='
        return prefix + article_id

    def cleanup_image_tags(self, soup):
        """Remove height/width/border/align from every ``img`` tag in ``soup``.

        Returns the same ``soup`` object after modifying it in place.
        """
        # NOTE(review): has_key is the older BeautifulSoup spelling; confirm
        # it is still provided by the BeautifulSoup version in use.
        for image in soup.findAll('img'):
            for attrib in ('height', 'width', 'border', 'align'):
                if image.has_key(attrib):
                    del image[attrib]
        return soup

    def preprocess_html(self, soup):
        """Strip presentation markup from ``soup`` and return it.

        Drops the body's ``onload`` attribute, removes every inline ``style``
        attribute, renames ``span`` tags to ``div`` and finally cleans the
        image tags via ``cleanup_image_tags``.
        """
        body = soup.body
        # A document without a <body> yields None here; skip it instead of
        # failing on item deletion.
        if body is not None:
            del body['onload']
        for styled in soup.findAll(style=True):
            del styled['style']
        for span in soup.findAll('span'):
            span.name = "div"
        return self.cleanup_image_tags(soup)