# calibre/recipes/big_oven.recipe

from calibre.web.feeds.news import BasicNewsRecipe
import re

class BigOven(BasicNewsRecipe):
    """Calibre news recipe for the BigOven recipe site.

    Articles are taken from the FeedBurner feed listed in ``feeds``.  When
    both a username and a password are configured, ``get_browser`` signs in
    through the site's login form; ``preprocess_html`` then trims each page
    down to the containers holding the title, ingredients and preparation.
    """

    title               = 'BigOven'
    __author__          = 'Starson17'
    description         = 'Recipes for the Foodie in us all. Registration is free. A fake username and password just gives smaller photos.'
    language            = 'en'
    category            = 'news, food, recipes, gourmet'
    publisher           = 'Starson17'
    use_embedded_content= False
    no_stylesheets      = True
    oldest_article      = 24
    remove_javascript   = True
    remove_empty_feeds    = True
    cover_url           = 'http://www.software.com/images/products/BigOven%20Logo_177_216.JPG'
    max_articles_per_feed = 30
    needs_subscription = True

    # Metadata handed to the conversion step; mirrors the attributes above.
    conversion_options = {'linearize_tables'  : True
                        , 'comment'           : description
                        , 'tags'              : category
                        , 'publisher'         : publisher
                        , 'language'          : language
                        }

    def get_browser(self):
        """Return a browser, signed in to bigoven.com when credentials are set.

        The login form is submitted only if both ``self.username`` and
        ``self.password`` are not None; otherwise the browser is returned
        without visiting the login page.
        """
        # The base implementation is called through the class, so the
        # instance must be passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.bigoven.com/account/login?ReturnUrl=/')
            # The login form is selected by position: second form on the page.
            br.select_form(nr=1)
            br['Email']  = self.username
            br['Password'] = self.password
            br.submit()
        return br

    remove_attributes = ['style', 'font']

    remove_tags     = [dict(name='div', attrs={'class':['ppy-caption']})
                                  ,dict(name='div', attrs={'id':['float_corner']})
                                  ]

    def preprocess_html(self, soup):
        """Strip non-recipe markup from a parsed page and return the soup.

        Steps, in order:
          1. Replace every ``<a class="deflink">`` with its text.
          2. Remove the grandparent of each "View Metric" and
             "Add my own photo" link match.
          3. Remove every ``<div class="container">`` that has neither an
             ``<h1>`` nor an ``<h2>`` matching "Ingredients"/"Preparation".

        The soup is modified in place and also returned.
        """
        for tag in soup.findAll(name='a', attrs={'class':['deflink']}):
            text = tag.string
            if text is None:
                # No single string child (e.g. nested markup): fall back to
                # the concatenated text nodes instead of passing None on.
                text = ''.join(tag.findAll(text=True))
            tag.replaceWith(text)

        # NOTE(review): depending on the BeautifulSoup version, combining
        # name= with text= may yield text nodes or <a> tags, which changes
        # which ancestor ``parent.parent`` is -- confirm against the parser
        # calibre ships.
        unwanted_links = (
            re.compile(r'.*View Metric.*', re.DOTALL),
            re.compile(r'.*Add my own photo.*', re.DOTALL),
        )
        for pattern in unwanted_links:
            for tag in soup.findAll(name='a', text=pattern):
                parent = tag.parent
                wrapper = parent.parent if parent is not None else None
                # An earlier extraction may already have detached this
                # subtree, leaving no grandparent to remove.
                if wrapper is not None:
                    wrapper.extract()

        # Compiled once instead of on every loop iteration.
        ingredients_heading = re.compile(r'.*Ingredients.*', re.DOTALL)
        preparation_heading = re.compile(r'Preparation.*', re.DOTALL)
        for tag in soup.findAll(name='div', attrs={'class':['container']}):
            if tag.find(name='h1'):
                continue
            if tag.find(name='h2', text=ingredients_heading):
                # Parenthesised single-argument form prints the same text
                # under both the print statement and the print function.
                print('tag found Ingred h2')
                continue
            if tag.find(name='h2', text=preparation_heading):
                print('tag found Prep h2')
                continue
            tag.extract()
        return soup

    feeds = [(u'4 & 5 Star Rated Recipes', u'http://feeds.feedburner.com/Bigovencom-RecipeRaves?format=xml')]