#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2019, Kovid Goyal

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build soup search keyword arguments matching any of several CSS classes.

    :param classes: Space-separated class names.
    :return: A dict of the form ``{'attrs': {'class': matcher}}`` intended to
        be unpacked into ``find``/``findAll``. ``matcher`` is truthy when the
        element's ``class`` attribute shares at least one name with *classes*.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        # ``x`` is falsy when the element has no class attribute at all.
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def absolutize(href):
    """Return *href* with the LRB site origin prepended when it is root-relative.

    Any href that does not start with ``/`` is returned unchanged.
    """
    if href.startswith('/'):
        href = 'https://www.lrb.co.uk' + href
    return href


class LondonReviewOfBooksPayed(BasicNewsRecipe):
    """Recipe that downloads one issue of the London Review of Books.

    Login is optional (``needs_subscription = 'optional'``): credentials are
    only used when both a username and a password are supplied.
    """

    title = 'London Review of Books'
    __author__ = 'Kovid Goyal'
    description = 'Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'  # noqa
    category = 'news, literature, UK'
    publisher = 'LRB Ltd.'
    language = 'en_GB'
    no_stylesheets = True
    delay = 1
    encoding = 'utf-8'
    # Default table-of-contents page; parse_index() may replace it with the
    # user-supplied 'issue_url' option.
    INDEX = 'https://www.lrb.co.uk/the-paper/'
    publication_type = 'magazine'
    needs_subscription = 'optional'
    requires_version = (3, 0, 0)
    masthead_url = 'https://www.mylrb.co.uk/out/lrb-2014/img/logo-2x.png'
    # Built from adjacent literals so the stylesheet text stays exactly as it
    # was while keeping each rule on its own source line.
    extra_css = (
        ' .article-word-count, #article-tag-holder { font-size:small; color:#202020; }'
        ' .embedded-image-caption { font-size:small; text-align:center; }'
        ' blockquote, em { color:#202020; } '
    )
    resolve_internal_links = True

    keep_only_tags = [
        dict(attrs={'id': ['article-tag-holder', 'article-heading-holder']}),
        classes('article-copy article-word-count'),
    ]
    remove_attributes = ['style', 'width', 'height']

    def get_browser(self):
        """Return the recipe's browser, logged in when credentials are set.

        :raises ValueError: if the page returned after submitting the login
            form does not contain the ``>My Account<`` marker.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('https://www.lrb.co.uk/login')
            br.select_form(id='login_form')
            br['_username'] = self.username
            br['_password'] = self.password
            raw = br.submit().read()
            # The response body is bytes, hence the bytes marker.
            if b'>My Account<' not in raw:
                raise ValueError('Failed to login check username and password')
        return br

    def preprocess_html(self, soup):
        """Normalise article markup before conversion and return the soup.

        * every ``h2`` is renamed to ``h4``;
        * ``p`` tags inside image captions are renamed to ``div``;
        * images carrying ``data-srcset`` get a concrete ``src`` derived from
          their ``data-appsrc`` attribute.
        """
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        for cap in soup.findAll(**classes('embedded-image-caption')):
            for p in cap.findAll('p'):
                p.name = 'div'
        for img in soup.findAll('img', attrs={'data-srcset': True}):
            # The selector only guarantees data-srcset; an image lacking
            # data-appsrc is left untouched instead of raising KeyError.
            appsrc = img.get('data-appsrc')
            if not appsrc:
                continue
            img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + appsrc.split('/images/')[-1]
        return soup

    recipe_specific_options = {
        'issue_url': {
            'short': 'The issue URL ',
            'long': 'For example, https://www.lrb.co.uk/the-paper/v46/n01',
            'default': 'https://www.lrb.co.uk/the-paper/'
        }
    }

    def parse_index(self):
        """Fetch the issue's table of contents and return its article list.

        Side effects: may overwrite ``self.INDEX`` with the 'issue_url'
        option, sets ``self.timefmt`` from the issue heading, and sets
        ``self.cover_url`` when a cover image is found.

        :return: ``[('Articles', [{'title': ..., 'url': ...}, ...])]``
        :raises ValueError: if the page has no ``toc-grid-items`` element.
        """
        d = self.recipe_specific_options.get('issue_url')
        # Only a non-empty string overrides the index; the class-level entry
        # for this key is an option-definition dict and is ignored here.
        # NOTE(review): presumably calibre swaps in user-supplied string
        # values at runtime - confirm against the BasicNewsRecipe docs.
        if d and isinstance(d, str):
            self.INDEX = d
        soup = self.index_to_soup(self.INDEX)

        # The cover is optional: tolerate a missing container, a container
        # without an <img>, or an <img> without data-appsrc.
        container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
        cover_img = container.img if container is not None else None
        cover_src = cover_img.get('data-appsrc') if cover_img is not None else None
        if cover_src:
            self.cover_url = 'https://www.lrb.co.uk/storage/800_filter/images/' + cover_src.split('/images/')[-1]

        edition = self.tag_to_string(soup.find('h1', attrs={'class': 'toc-title'}))
        self.timefmt = ' [{}]'.format(edition)
        self.log('Downloading: ', edition)

        grid = soup.find(attrs={'class': 'toc-grid-items'})
        if grid is None:
            # Fail with a message naming the page rather than an opaque
            # AttributeError on None.
            raise ValueError('Could not find the table of contents at ' + self.INDEX)

        articles = []
        for a in grid.findAll(**classes('toc-item')):
            href = a.get('href')
            if not href:
                # A toc-item without a link cannot be downloaded.
                continue
            url = absolutize(href)
            h3 = a.find('h3')
            h4 = a.find('h4')
            title = '{}: {}'.format(self.tag_to_string(h3), self.tag_to_string(h4))
            self.log(title, url)
            articles.append({'title': title, 'url': url})
        return [('Articles', articles)]