# calibre/resources/recipes/tomshardware.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
tomshardware.com/us
'''

import urllib
from calibre.web.feeds.recipes import BasicNewsRecipe

class Tomshardware(BasicNewsRecipe):
    """Recipe for the US edition of Tom's Hardware (tomshardware.com/us).

    Articles come from the two Atom feeds listed in ``feeds`` and are
    rewritten to the site's print pages by ``print_version``. When a
    username and password are set, ``get_browser`` posts them to ``LOGIN``.
    """

    # Recipe metadata.
    title               = "Tom's Hardware US"
    __author__          = 'Darko Miletic'
    description         = 'Hardware reviews and News'
    publisher           = "Tom's Hardware"
    category            = 'news, IT, hardware, USA'
    no_stylesheets      = True
    needs_subscription  = True
    language = 'en'

    # Site root and the URL the login form data is posted to.
    INDEX               = 'http://www.tomshardware.com'
    LOGIN               = INDEX + '/membres/'
    remove_javascript   = True
    use_embedded_content= False

    # Metadata forwarded to the LRF and EPUB converters.
    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    def get_browser(self):
        """Return the base recipe browser, logged in when credentials exist.

        The US front page is always opened first; the login POST is only
        sent when both ``self.username`` and ``self.password`` are set.
        """
        # Pass the instance explicitly: the base method is being called
        # through the class, so nothing binds ``self`` for us.
        br = BasicNewsRecipe.get_browser(self)
        # NOTE(review): presumably this first request establishes the
        # session cookies the login relies on -- confirm before removing.
        br.open(self.INDEX + '/us/')
        if self.username is not None and self.password is not None:
            # Field names are those the site's login endpoint expects;
            # 'r' carries the US front page URL.
            data = urllib.urlencode({
                'action': 'login_action',
                'r': self.INDEX + '/us/',
                'login': self.username,
                'mdp': self.password,
            })
            br.open(self.LOGIN, data)
        return br

    # Elements stripped from every downloaded page.
    remove_tags = [
                     dict(name='div' , attrs={'id':'header' })
                    ,dict(name='object')
                  ]

    feeds = [
              (u'Latest Articles', u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-2.xml'          )
             ,(u'Latest News'    , u'http://www.tomshardware.com/feeds/atom/tom-s-hardware-us,18-1.xml')
            ]

    def print_version(self, url):
        """Map an article URL to the matching print-page URL.

        The article id is the text between the last ``,`` and the trailing
        ``.html``. URLs whose path contains ``/reviews/`` go to
        ``review_print.php``; everything else goes to ``news_print.php``.
        For example ``.../reviews/some-title,1234.html`` becomes
        ``http://www.tomshardware.com/review_print.php?p1=1234``.
        """
        # Drop the '.html' suffix, then split the id off the last comma.
        stem = url.rpartition('.html')[0]
        path, _, article_id = stem.rpartition(',')
        if '/reviews/' in path:
            script = '/review_print.php?p1='
        else:
            script = '/news_print.php?p1='
        return self.INDEX + script + article_id

    def cleanup_image_tags(self, soup):
        """Strip presentational attributes from every ``<img>`` and return soup."""
        for item in soup.findAll('img'):
            for attrib in ['height', 'width', 'border', 'align']:
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def preprocess_html(self, soup):
        """Clean up a downloaded page and return the modified soup.

        Removes the body's ``onload`` handler, drops every inline ``style``
        attribute, renames ``<span>`` tags to ``<div>`` and finally strips
        the layout attributes from images.
        """
        body = soup.body
        # A document without a <body> has nothing to strip; indexing None
        # here would raise TypeError.
        if body is not None:
            del body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        for it in soup.findAll('span'):
            it.name = "div"
        return self.cleanup_image_tags(soup)