# calibre/recipes/epicurious.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2010, Starson17'
'''
www.epicurious.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

class Epicurious(BasicNewsRecipe):
    """Calibre news recipe for Epicurious (www.epicurious.com).

    The recipe is purely declarative apart from ``postprocess_html``: the
    class attributes below configure the ``BasicNewsRecipe`` machinery (see
    the calibre recipe API documentation for the exact meaning of each
    attribute), and the single hook flattens table markup after download.
    """

    # --- Metadata -----------------------------------------------------------
    title = u'Epicurious'
    __author__ = 'Starson17'
    description = 'Food and Recipes from Epicurious'
    cover_url = 'http://up6.podbean.com/image-logos/21849_logo.jpg'
    publisher = 'Epicurious'
    tags = 'news, food, gourmet, recipes'
    language = 'en'

    # --- Fetch behaviour ----------------------------------------------------
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    # Links are followed recursively; only URLs matching ``match_regexps``
    # below are eligible (per the calibre BasicNewsRecipe documentation).
    recursions = 3
    oldest_article = 14
    max_articles_per_feed = 20

    # --- Page clean-up ------------------------------------------------------
    # Containers whose content is kept, selected by CSS class or element id.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['mainconsolewrapper', 'videoheader',
                                          'content_unit', 'entry-content',
                                          'see_more_block']}),
        dict(name='div', attrs={'id': ['headline', 'introBlock', 'ingredients',
                                       'preparation', 'articleContent',
                                       'in_categories_block']}),
    ]

    # Elements stripped from the page, selected by id or CSS class.
    remove_tags = [
        {'id': ['printShoppingList', 'addnoteLnk', 'btnUploadVideo',
                'enlarge_image']},
        {'class': ['subLnk', 'sbmWrapper', 'detail_division', 'entry-footer',
                   'comment-footer']},
        dict(name='div', attrs={'class': ['tagged', 'comments']}),
    ]

    remove_tags_after = [dict(name='div', attrs={'class': 'entry-content'})]

    # --- Feeds --------------------------------------------------------------
    # (title, url) pairs; the titles are reproduced exactly as published.
    feeds = [
        (u'Recipes: Healthy dinner ', u'http://feeds.epicurious.com/healthy_recipes'),
        (u'New Recipes ', u'http://feeds.epicurious.com/newrecipes'),
        (u'Features ', u'http://feeds.epicurious.com/latestfeatures'),
        (u'Blogs ', u'http://feeds.feedburner.com/epicurious/epiblog'),
    ]

    # Literal dots in the host name are escaped so that "." cannot match an
    # arbitrary character; the ".*" wildcards are intentional.
    match_regexps = [
        r'http://www\.epicurious\.com/.*recipes/.*/views',
    ]

    # (compiled pattern, replacement callable) pairs applied to the raw HTML.
    preprocess_regexps = [
        # Drop a newline that directly follows a "/".
        (re.compile(r'/\n', re.DOTALL | re.IGNORECASE), lambda match: '/'),
        # Strip the "_116" suffix from JPEG file names (presumably a
        # thumbnail-size marker -- TODO confirm).  The dot is escaped so only
        # a real ".jpg" extension matches.
        (re.compile(r'_116\.jpg', re.DOTALL | re.IGNORECASE), lambda match: '.jpg'),
        # Cut everything from the comments <div> up to the closing </body>.
        (re.compile(r'<div class="comments".*</body>', re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
    ]

    def postprocess_html(self, soup, first_fetch):
        """Rename every ``table``, ``tr`` and ``td`` element to ``div``.

        :param soup: parsed document; modified in place.
        :param first_fetch: not used by this implementation.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll(['table', 'tr', 'td']):
            tag.name = 'div'
        return soup