from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup class NewYorkTimesBookReview(BasicNewsRecipe): title = u'New York Times Book Review' language = 'en' __author__ = 'Krittika Goyal' oldest_article = 8 #days max_articles_per_feed = 1000 recursions = 2 #encoding = 'latin1' remove_stylesheets = True #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) remove_tags_after = dict(name='div', attrs={'id':'authorId'}) remove_tags = [ dict(name='iframe'), dict(name=['div', 'a'], attrs={'class':['enlargeThis', 'jumpLink']}), dict(name='div', attrs={'id':['sidebarArticles', 'toolsRight']}), #dict(name='ul', attrs={'class':'article-tools'}), #dict(name='ul', attrs={'class':'articleTools'}), ] match_regexps = [ r'http://www.nytimes.com/.+pagewanted=[2-9]+' ] feeds = [ ('New York Times Sunday Book Review', 'http://feeds.nytimes.com/nyt/rss/SundayBookReview'), ] def preprocess_html(self, soup): story = soup.find(name='div', attrs={'id':'article'}) #td = heading.findParent(name='td') #td.extract() soup = BeautifulSoup('