#!/usr/bin/env python2

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
nybooks.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe


def find_header(tag):
    """Return True for a ``header`` tag whose parent has ``class="article"``.

    Used as the ``name`` matcher in ``keep_only_tags``.  The parent's class
    is read with ``get()`` so that a header sitting inside an element with
    no ``class`` attribute is simply not matched instead of raising
    ``KeyError``.
    """
    if tag.name != 'header':
        return False
    parent = tag.parent
    return parent is not None and parent.get('class') == 'article'


def absurl(url):
    """Prefix site-relative URLs (starting with ``/``) with the site root.

    Any other URL is returned unchanged.
    """
    if url.startswith('/'):
        url = 'http://www.nybooks.com' + url
    return url


class NewYorkReviewOfBooks(BasicNewsRecipe):
    """Recipe that downloads the current issue from nybooks.com."""

    title = u'New York Review of Books'
    description = u'Book reviews'
    language = 'en'

    __author__ = 'Kovid Goyal'

    no_stylesheets = True
    no_javascript = True
    needs_subscription = True

    # Keep the article body, the article header and the footnote /
    # subscriber-notice blocks.
    keep_only_tags = [
        dict(name='section', attrs={'class': 'article_body'}),
        dict(name=find_header),
        dict(name='div', attrs={
            'class': ['footnotes', 'for-subscribers-only']}),
    ]

    # NOTE(review): as written, this lazy pattern only ever matches the
    # empty string, so the substitution changes nothing.  The intended
    # pattern may have been lost at some point -- confirm against upstream
    # before relying on it.
    preprocess_regexps = [(re.compile(r'.*?', re.DOTALL), lambda m:'')]

    def print_version(self, url):
        """Return ``url`` with ``?pagination=false`` as its query string.

        Everything from the last ``?`` onwards is dropped before the new
        query string is appended.
        """
        if '?' in url:
            url = url.rpartition('?')[0]
        return url + '?pagination=false'

    def get_browser(self):
        """Return a browser that has submitted the sign-in form.

        The credentials come from ``self.username`` / ``self.password``.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.nybooks.com/account/signin/')
        # The form is selected by position -- presumably the login form on
        # the sign-in page; re-check the index if the site markup changes.
        br.select_form(nr=2)
        br['user_login'] = self.username
        br['user_password'] = self.password
        br.submit()
        return br

    def preprocess_html(self, soup):
        """Move the article header to the top of the body and tidy it up.

        The header's ``details`` div and every ``input`` element in the
        document are removed.  Pages lacking a header, a body or the
        details div are tolerated rather than aborting the download.
        """
        header = soup.find('header')
        body = soup.find('body')
        if header is not None:
            if body is not None:
                body.insert(0, header)
            details = header.find('div', attrs={'class': 'details'})
            if details is not None:
                details.extract()
        for field in soup.findAll('input'):
            field.extract()
        return soup

    def postprocess_html(self, soup, first):
        """Strip the ``srcset`` attribute from every image that has one."""
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        return soup

    def parse_index(self):
        """Build the article list for the current issue.

        Also sets ``self.cover_url`` and ``self.timefmt`` when the cover
        image and the issue date are present on the index page.

        Returns a one-element list: ``('Current Issue', articles)`` where
        each article is a dict with ``title``, ``url``, ``date`` and
        ``description`` keys.

        Raises ``AttributeError`` if the ``current_issue`` container is
        missing from the page (unchanged from the original behaviour).
        """
        soup = self.index_to_soup('http://www.nybooks.com/current-issue')

        # Find cover
        sidebar = soup.find('div', attrs={'class': 'issue_cover'})
        if sidebar is not None:
            img = sidebar.find('img', src=True)
            # The cover container may exist without a usable <img src=...>.
            if img is not None:
                self.cover_url = absurl(img['src'])
                self.log('Found cover at:', self.cover_url)

        # Find date
        stamp = soup.find('time', pubdate='pubdate')
        if stamp is not None:
            text = self.tag_to_string(stamp)
            # Keep only the part before the bullet separator.
            date = text.partition(u'\u2022')[0].strip()
            self.timefmt = u' [%s]' % date
            self.log('Issue date:', date)

        # Find TOC
        tocs = soup.find('div', attrs={'class': 'current_issue'}).findAll(
            'div', attrs={'class': 'articles_list'})
        articles = []
        for toc in tocs:
            for row in toc.findAll('div', attrs={'class': 'row'}):
                h2 = row.find('h2')
                link = h2.find('a', href=True) if h2 is not None else None
                if link is None:
                    # Not an article row (no linked heading): skip it
                    # instead of failing the whole index.
                    continue
                title = self.tag_to_string(h2).strip()
                author = self.tag_to_string(
                    row.find('div', attrs={'class': 'author'})).strip()
                title = title + u' (%s)' % author
                url = absurl(link['href'])
                desc = ''.join(
                    self.tag_to_string(p)
                    for p in row.findAll(
                        'p', attrs={'class': lambda x: x and 'quiet' in x}))
                self.log('Found article:', title)
                self.log('\t', url)
                self.log('\t', desc)
                articles.append({'title': title, 'url': url, 'date': '',
                                 'description': desc})

        return [('Current Issue', articles)]