__license__ = 'GPL v3'
__copyright__ = '2011, Starson17 <Starson17 at gmail.com>'
'''
www.wired.co.uk
'''

import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Wired_UK(BasicNewsRecipe):
    title = 'Wired Magazine - UK edition'
    __author__ = 'Starson17'
    __version__ = 'v1.30'
    __date__ = '15 July 2011'
    description = 'Technology, science, culture and business news from Wired UK'
    publisher = 'Conde Nast Digital'
    category = 'news, games, IT, gadgets'
    oldest_article = 40
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    # masthead_url = 'http://www.wired.co.uk/_/media/wired-logo_UK.gif'
    language = 'en_GB'
    index = 'http://www.wired.co.uk'
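
    # Metadata passed through to the ebook conversion pipeline.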
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }
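
    # Keep only the main article column; drop the sidebar and comment widgets.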
    keep_only_tags = [dict(name='div', attrs={'class': ['layoutColumn1']})]
    remove_tags = [dict(name='div', attrs={'class': ['articleSidebar1', 'commentAddBox linkit', 'commentCountBox commentCountBoxBig']})]
    remove_tags_after = dict(name='div', attrs={'class': ['mainCopy entry-content', 'mainCopy']})
    '''
    remove_attributes = ['height','width']
    ,dict(name=['object','embed','iframe','link'])
    ,dict(attrs={'class':['opts','comment','stories']})
    ]
    '''
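
    # Build the feed list by scraping article links from the www.wired.co.uk
    # homepage and the /magazine section pages; '?page=all' is appended so
    # each article is fetched as a single page.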
    def parse_index(self):
        totalfeeds = []
        soup = self.index_to_soup(self.index)
        recentcontent = soup.find('ul', attrs={'class': 'linkList3'})
        mfeed = []
        if recentcontent:
            for li in recentcontent.findAll('li'):
                a = li.h2.a
                url = self.index + a['href'] + '?page=all'
                title = self.tag_to_string(a)
                description = ''
                date = strftime(self.timefmt)
                mfeed.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': description,
                })
        totalfeeds.append(('Wired UK Magazine Latest News', mfeed))

        popmagcontent = soup.findAll('div', attrs={'class': 'sidebarLinkList'})
        magcontent = popmagcontent[1] if len(popmagcontent) > 1 else None
        mfeed2 = []
        if magcontent:
            a = magcontent.h3.a
            if a:
                url = self.index + a['href'] + '?page=all'
                title = self.tag_to_string(a)
                description = ''
                date = strftime(self.timefmt)
                mfeed2.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': description,
                })
            for li in magcontent.findAll('li'):
                a = li.a
                url = self.index + a['href'] + '?page=all'
                title = self.tag_to_string(a)
                description = ''
                date = strftime(self.timefmt)
                mfeed2.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': description,
                })
        totalfeeds.append(('Wired UK Magazine Features', mfeed2))
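
        # The /magazine page lists the current issue's sections; the 'Start'
        # and 'Play' blocks are scraped into two further feeds below.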
        magsoup = self.index_to_soup(self.index + '/magazine')
        start_title = magsoup.find('h3', attrs={'class': 'magSubSectionTitle titleStart'})
        startcontent = start_title.parent if start_title else None
        mfeed3 = []
        if startcontent:
            for li in startcontent.findAll('li'):
                a = li.a
                url = self.index + a['href'] + '?page=all'
                title = self.tag_to_string(a)
                description = ''
                date = strftime(self.timefmt)
                mfeed3.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': description,
                })
        totalfeeds.append(('Wired UK Magazine More', mfeed3))

        play_title = magsoup.find('h3', attrs={'class': 'magSubSectionTitle titlePlay'})
        playcontent = play_title.parent if play_title else None
        mfeed4 = []
        if playcontent:
            for li in playcontent.findAll('li'):
                a = li.a
                url = self.index + a['href'] + '?page=all'
                title = self.tag_to_string(a)
                description = ''
                date = strftime(self.timefmt)
                mfeed4.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': description,
                })
        totalfeeds.append(('Wired UK Magazine Play', mfeed4))
        return totalfeeds
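
    # Use the first cover image found on the magazine archive page as the
    # ebook cover.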
    def get_cover_url(self):
        cover_url = ''
        soup = self.index_to_soup(self.index + '/magazine/archive')
        cover_item = soup.find('div', attrs={'class': 'image linkme'})
        if cover_item:
            cover_url = cover_item.img['src']
        return cover_url
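
    # Drop paragraphs containing the 'This article was taken from ...'
    # cross-promotion notice.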
    def preprocess_html(self, soup):
        for tag in soup.findAll(name='p'):
            if tag.find(name='span', text=re.compile(r'This article was taken from.*', re.DOTALL | re.IGNORECASE)):
                tag.extract()
        return soup
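
    # Normalise fonts and sizes in the generated ebook.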
    extra_css = '''
    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''