From e1b25b6f529af478d8777bc7d804b318f6ce140b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 9 Oct 2013 10:31:11 +0530 Subject: [PATCH] Update New York Review of Books Fixes #1235790 [Private bug](https://bugs.launchpad.net/calibre/+bug/1235790) --- recipes/new_york_review_of_books.recipe | 79 ++++++++++--------- .../new_york_review_of_books_no_sub.recipe | 57 +++++++------ 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/recipes/new_york_review_of_books.recipe b/recipes/new_york_review_of_books.recipe index 2da9536da3..09d178b3ba 100644 --- a/recipes/new_york_review_of_books.recipe +++ b/recipes/new_york_review_of_books.recipe @@ -1,4 +1,3 @@ - #!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' @@ -11,6 +10,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe +def find_header(tag): + return tag.name == 'header' and tag.parent['class'] == 'article' + class NewYorkReviewOfBooks(BasicNewsRecipe): title = u'New York Review of Books' @@ -23,65 +25,70 @@ class NewYorkReviewOfBooks(BasicNewsRecipe): no_javascript = True needs_subscription = True - keep_only_tags = [dict(id=['article-body','page-title'])] - remove_tags = [dict(attrs={'class':['article-tools', 'article-links', - 'center advertisement']})] + keep_only_tags = [ + dict(name='section', attrs={'class':'article_body'}), + dict(name=find_header), + dict(name='div', attrs={'class':'for-subscribers-only'}), + ] preprocess_regexps = [(re.compile(r'.*?', re.DOTALL), lambda m:'')] + def print_version(self, url): + return url+'?pagination=false' + def get_browser(self): br = BasicNewsRecipe.get_browser(self) br.open('http://www.nybooks.com/account/signin/') - br.select_form(nr = 1) + br.select_form(nr=2) br['username'] = self.username br['password'] = self.password br.submit() return br - def print_version(self, url): - return url+'?pagination=false' + def preprocess_html(self, soup): + header = soup.find('header') + body = soup.find('body') + body.insert(0, header) + header.find('div', attrs={'class':'details'}).extract() + for i in soup.findAll('input'): + i.extract() + return soup def parse_index(self): soup = self.index_to_soup('http://www.nybooks.com/current-issue') # Find cover - sidebar = soup.find(id='sidebar') + sidebar = soup.find('div', attrs={'class':'issue_cover'}) if sidebar is not None: - a = sidebar.find('a', href=lambda x: x and 'view-photo' in x) - if a is not None: - psoup = self.index_to_soup('http://www.nybooks.com'+a['href']) - cover = psoup.find('img', src=True) - self.cover_url = cover['src'] - self.log('Found cover at:', self.cover_url) + img = sidebar.find('img', src=True) + self.cover_url = 'http://www.nybooks.com' + img['src'] + self.log('Found cover at:', self.cover_url) # Find date - div = soup.find(id='page-title') + div = soup.find('time', pubdate='pubdate') if div is not None: - h5 = div.find('h5') - if h5 is not None: - text = self.tag_to_string(h5) - date = text.partition(u'\u2022')[0].strip() - self.timefmt = u' [%s]'%date - self.log('Issue date:', date) + text = self.tag_to_string(div) + date = text.partition(u'\u2022')[0].strip() + self.timefmt = u' [%s]'%date + self.log('Issue date:', date) # Find TOC - tocs = soup.findAll('ul', attrs={'class':'issue-article-list'}) + toc = soup.find('div', attrs={'class':'current_issue'}).find('div', attrs={'class':'articles_list'}) articles = [] - for toc in tocs: - for li in toc.findAll('li'): - h3 = li.find('h3') - title = self.tag_to_string(h3) - author = self.tag_to_string(li.find('h4')) - title = title + u' (%s)'%author - url = 'http://www.nybooks.com'+h3.find('a', href=True)['href'] - desc = '' - for p in li.findAll('p'): - desc += self.tag_to_string(p) - self.log('Found article:', title) - self.log('\t', url) - self.log('\t', desc) - articles.append({'title':title, 'url':url, 'date':'', + for div in toc.findAll('div', attrs={'class':'row'}): + h2 = div.find('h2') + title = self.tag_to_string(h2).strip() + author = self.tag_to_string(div.find('div', attrs={'class':'author'})).strip() + title = title + u' (%s)'%author + url = 'http://www.nybooks.com' + h2.find('a', href=True)['href'] + desc = '' + for p in div.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}): + desc += self.tag_to_string(p) + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'', 'description':desc}) return [('Current Issue', articles)] diff --git a/recipes/new_york_review_of_books_no_sub.recipe b/recipes/new_york_review_of_books_no_sub.recipe index e462689403..90a4fd8544 100644 --- a/recipes/new_york_review_of_books_no_sub.recipe +++ b/recipes/new_york_review_of_books_no_sub.recipe @@ -10,6 +10,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe +def find_header(tag): + return tag.name == 'header' and tag.parent['class'] == 'article' + class NewYorkReviewOfBooks(BasicNewsRecipe): title = u'New York Review of Books (no subscription)' @@ -21,9 +24,11 @@ class NewYorkReviewOfBooks(BasicNewsRecipe): no_stylesheets = True no_javascript = True - keep_only_tags = [dict(id=['article-body', 'page-title'])] - remove_tags = [dict(attrs={'class':['article-tools', 'article-links', - 'center advertisement']})] + keep_only_tags = [ + dict(name='section', attrs={'class':'article_body'}), + dict(name=find_header), + dict(name='div', attrs={'class':'for-subscribers-only'}), + ] preprocess_regexps = [(re.compile(r'.*?', re.DOTALL), lambda m:'')] @@ -31,40 +36,44 @@ class NewYorkReviewOfBooks(BasicNewsRecipe): def print_version(self, url): return url+'?pagination=false' + def preprocess_html(self, soup): + header = soup.find('header') + body = soup.find('body') + body.insert(0, header) + header.find('div', attrs={'class':'details'}).extract() + for i in soup.findAll('input'): + i.extract() + return soup + def parse_index(self): soup = self.index_to_soup('http://www.nybooks.com/current-issue') # Find cover - sidebar = soup.find(id='sidebar') + sidebar = soup.find('div', attrs={'class':'issue_cover'}) if sidebar is not None: - a = sidebar.find('a', href=lambda x: x and 'view-photo' in x) - if a is not None: - psoup = self.index_to_soup('http://www.nybooks.com'+a['href']) - cover = psoup.find('img', src=True) - self.cover_url = cover['src'] - self.log('Found cover at:', self.cover_url) + img = sidebar.find('img', src=True) + self.cover_url = 'http://www.nybooks.com' + img['src'] + self.log('Found cover at:', self.cover_url) # Find date - div = soup.find(id='page-title') + div = soup.find('time', pubdate='pubdate') if div is not None: - h5 = div.find('h5') - if h5 is not None: - text = self.tag_to_string(h5) - date = text.partition(u'\u2022')[0].strip() - self.timefmt = u' [%s]'%date - self.log('Issue date:', date) + text = self.tag_to_string(div) + date = text.partition(u'\u2022')[0].strip() + self.timefmt = u' [%s]'%date + self.log('Issue date:', date) # Find TOC - toc = soup.find('ul', attrs={'class':'issue-article-list'}) + toc = soup.find('div', attrs={'class':'current_issue'}).find('div', attrs={'class':'articles_list'}) articles = [] - for li in toc.findAll('li'): - h3 = li.find('h3') - title = self.tag_to_string(h3) - author = self.tag_to_string(li.find('h4')) + for div in toc.findAll('div', attrs={'class':'row'}): + h2 = div.find('h2') + title = self.tag_to_string(h2).strip() + author = self.tag_to_string(div.find('div', attrs={'class':'author'})).strip() title = title + u' (%s)'%author - url = 'http://www.nybooks.com'+h3.find('a', href=True)['href'] + url = 'http://www.nybooks.com' + h2.find('a', href=True)['href'] desc = '' - for p in li.findAll('p'): + for p in div.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}): desc += self.tag_to_string(p) self.log('Found article:', title) self.log('\t', url)