# calibre/recipes/dot_net.recipe

from calibre.web.feeds.news import BasicNewsRecipe
import re

class dotnetMagazine(BasicNewsRecipe):
    """Recipe that fetches '.net magazine' articles from one FeedBurner feed.

    The class is almost entirely declarative: it sets recipe attributes
    (metadata, feed list, tag-stripping rules) and overrides
    ``skip_ad_pages`` to follow the "continue to article" link found on
    interstitial pages.
    """

    # --- recipe metadata -------------------------------------------------
    __author__ = u'Bonni Salles'
    __version__ = '1.1'
    __license__ = 'GPL v3'
    __copyright__ = u'2013, Bonni Salles'

    title = '.net magazine'
    language = 'en'
    cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'

    # --- download behaviour ----------------------------------------------
    oldest_article = 7
    encoding = 'utf8'
    use_embedded_content = False
    remove_empty_feeds = True
    # recursion = 1

    # --- presentation ----------------------------------------------------
    no_stylesheets = True
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '

    # --- page trimming ---------------------------------------------------
    # Both boundary specs select an element whose ``id`` is falsy, i.e. a
    # <header>/<footer> tag that carries no id attribute.
    remove_tags_before = {'name': 'header', 'id': lambda value: not value}
    remove_tags_after = {'name': 'footer', 'id': lambda value: not value}

    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'item-list'}},
        {'name': 'h4', 'attrs': {'class': 'std-hdr'}},
        # Share links.
        {'name': 'div', 'attrs': {'class': 'item-list share-links'}},
        {'name': ['script', 'noscript']},
        # Reader comments: drop this entry (and the 'comments' one below)
        # if the comments should be kept in the output.
        {'name': 'div', 'attrs': {'id': 'comments-form'}},
        # NOTE(review): the group contains an empty alternative, so it can
        # match the empty string and adds no constraint after the prefix.
        {'name': 'div', 'attrs': {'id': re.compile('advertorial_block_($|| )')}},
        {'name': 'div', 'attrs': {'id': 'right-col'}},
        {'name': 'div', 'attrs': {'id': 'comments'}},
        {'name': 'div', 'attrs': {'class': 'item-list related-content'}},
    ]

    feeds = [
        (u'net', u'http://feeds.feedburner.com/net/topstories?format=xml'),
    ]

    def skip_ad_pages(self, soup):
        """Return the real article's raw content when ``soup`` is an ad page.

        The page is treated as an interstitial when it contains a text node
        reading exactly 'click here to continue to article'. The ``href`` of
        that node's parent element is then fetched via ``index_to_soup`` with
        ``raw=True``. In every other case ``None`` is returned.
        """
        marker = soup.find(text='click here to continue to article')
        if not marker:
            return None

        # The marker text is expected to sit directly inside the link element.
        target = marker.parent.get('href')
        if not target:
            return None

        return self.index_to_soup(target, raw=True)