__license__ = 'GPL v3'
__copyright__ = '2008-2013, Darko Miletic '
'''
tomshardware.com/us
'''

try:
    from urllib.parse import urlencode
except ImportError:
    # Python 2 keeps urlencode directly in the urllib module.
    from urllib import urlencode

from calibre.web.feeds.recipes import BasicNewsRecipe


class Tomshardware(BasicNewsRecipe):
    """Recipe that downloads the Tom's Hardware US review and news feeds."""

    title = "Tom's Hardware US"
    __author__ = 'Darko Miletic'
    description = 'Hardware reviews and News'
    publisher = "Tom's Hardware"
    category = 'news, IT, hardware, USA'
    no_stylesheets = True
    # Credentials are optional: get_browser() only logs in when both are set.
    needs_subscription = 'optional'
    language = 'en'
    INDEX = 'http://www.tomshardware.com'
    LOGIN = INDEX + '/membres/'
    remove_javascript = True
    use_embedded_content = False

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    def get_browser(self):
        """Return a browser that has visited the US index and, if possible, logged in.

        The login form is only posted when both ``self.username`` and
        ``self.password`` are set.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX + '/us/')
        if self.username is not None and self.password is not None:
            data = urlencode({
                'action': 'login_action',
                'r': self.INDEX + '/us/',
                'login': self.username,
                'mdp': self.password,
            })
            br.open(self.LOGIN, data)
        return br

    remove_tags = [
        dict(name='div', attrs={'id': 'header'}),
        dict(name='object'),
    ]

    feeds = [
        (u'Reviews', u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-2.xml'),
        (u'News', u'http://www.tomshardware.com/feeds/rss2/tom-s-hardware-us,18-1.xml'),
    ]

    def print_version(self, url):
        """Map an article URL to the matching print-page URL.

        The article id is the text between the last comma and the trailing
        ``.html``. URLs whose path contains ``/reviews/`` go to
        ``review_print.php``; everything else goes to ``news_print.php``.

        If no id can be extracted (no ``.html`` suffix, no comma, or an empty
        id), the original ``url`` is returned unchanged instead of building a
        print URL with a missing or nonsensical ``p1`` value.
        """
        main, _, _ = url.rpartition('.html')
        rmain, comma, article_id = main.rpartition(',')
        if not (comma and article_id):
            # Without a comma, rpartition would hand back the whole prefix as
            # the "id"; without '.html' the id is empty. Neither is usable.
            return url
        _, reviews_sep, _ = rmain.rpartition('/reviews/')
        if reviews_sep:
            rind = 'http://www.tomshardware.com/review_print.php?p1='
        else:
            rind = 'http://www.tomshardware.com/news_print.php?p1='
        return rind + article_id

    def cleanup_image_tags(self, soup):
        """Strip presentational attributes from every ``<img>`` and return ``soup``."""
        for item in soup.findAll('img'):
            for attrib in ['height', 'width', 'border', 'align']:
                # Assign before deleting so the attribute is guaranteed to
                # exist when it is removed (kept from the original code;
                # presumably a parser workaround - TODO confirm).
                item[attrib] = ''
                del item[attrib]
        return soup

    def preprocess_html(self, soup):
        """Clean the parsed article before conversion and return ``soup``.

        Removes the body's ``onload`` handler, drops every inline ``style``
        attribute, renames ``<span>`` tags to ``<div>`` and strips the
        presentational attributes from images.
        """
        body = soup.body
        # A document without a <body> yields None here; skip the deletion
        # rather than fail the whole article with a TypeError.
        if body is not None:
            del body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        for it in soup.findAll('span'):
            it.name = "div"
        return self.cleanup_image_tags(soup)