From e9ff5e25994d85f02eefd55b069b682adbf35761 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 27 Oct 2012 22:15:10 +0530 Subject: [PATCH] Update New York Times Book Review --- recipes/nytimesbook.recipe | 51 ++++++-------------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe index 686f30b69a..5388da9dcb 100644 --- a/recipes/nytimesbook.recipe +++ b/recipes/nytimesbook.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup class NewYorkTimesBookReview(BasicNewsRecipe): title = u'New York Times Book Review' @@ -7,50 +6,16 @@ class NewYorkTimesBookReview(BasicNewsRecipe): __author__ = 'Krittika Goyal' oldest_article = 8 #days max_articles_per_feed = 1000 - recursions = 2 + #recursions = 2 #encoding = 'latin1' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True - remove_stylesheets = True - #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) - remove_tags_after = dict(name='div', attrs={'id':'authorId'}) - remove_tags = [ - dict(name='iframe'), - dict(name=['div', 'a'], attrs={'class':['enlargeThis', 'jumpLink']}), - dict(name='div', attrs={'id':['sidebarArticles', 'toolsRight']}), - #dict(name='ul', attrs={'class':'article-tools'}), - #dict(name='ul', attrs={'class':'articleTools'}), - ] - match_regexps = [ - r'http://www.nytimes.com/.+pagewanted=[2-9]+' - ] feeds = [ -('New York Times Sunday Book Review', - 'http://feeds.nytimes.com/nyt/rss/SundayBookReview'), -] + ('New York Times Sunday Book Review', + 'http://feeds.nytimes.com/nyt/rss/SundayBookReview'), + ] - - def preprocess_html(self, soup): - story = soup.find(name='div', attrs={'id':'article'}) - #td = heading.findParent(name='td') - #td.extract() - soup = BeautifulSoup('t') - body = soup.find(name='body') - body.insert(0, story) - #for x in soup.findAll(name='p', text=lambda x:x and '-->' in x): - #p = x.findParent('p') - #if p is not None: - #p.extract() - return soup - - def postprocess_html(self, soup, first): - for div in soup.findAll(id='pageLinks'): - div.extract() - if not first: - h1 = soup.find('h1') - if h1 is not None: - h1.extract() - t = soup.find(attrs={'class':'timestamp'}) - if t is not None: - t.extract() - return soup