#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2019, Kovid Goyal
"""calibre news recipe for the London Review of Books (lrb.co.uk).

The recipe scrapes the site's front page for the current issue, follows the
cover link to that issue's table of contents and returns one feed containing
every listed article.

NOTE: this recipe takes over the ``LondonReviewOfBooksPayed`` class name from
the former ``lrb_payed.recipe``; unlike that recipe it performs no login.
"""

from calibre.web.feeds.news import BasicNewsRecipe

# Single source of truth for the site root, shared by ``absolutize`` and the
# recipe's ``INDEX`` attribute.
_SITE_ROOT = 'https://www.lrb.co.uk'


def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS class names.

    :param classes: whitespace-separated class names; a tag matches when it
        carries at least one of them.
    :return: ``{'attrs': {'class': matcher}}``. ``matcher(value)`` is truthy
        when the whitespace-separated ``value`` shares a name with
        ``classes`` and falsy for ``None`` or an empty string.
    """
    # split() without an argument ignores repeated/mixed whitespace, so no
    # empty '' entry ends up in the set (split(' ') would produce one).
    wanted = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(wanted)})


def absolutize(href, base=_SITE_ROOT):
    """Turn a site-relative link into an absolute URL.

    :param href: link as found in the page markup.
    :param base: scheme and host prepended to root-relative links; defaults
        to the LRB site root.
    :return: ``href`` unchanged when it is empty or does not start with
        ``/``; otherwise an absolute URL.
    """
    if not href:
        return href
    if href.startswith('//'):
        # Protocol-relative URL: it already names a host, so only the scheme
        # of ``base`` is borrowed instead of gluing the whole site root on.
        scheme = base.partition('//')[0] or 'https:'
        return scheme + href
    if href.startswith('/'):
        return base + href
    return href


class LondonReviewOfBooksPayed(BasicNewsRecipe):
    """Download the current issue of the London Review of Books."""

    title = 'London Review of Books'
    __author__ = 'Kovid Goyal'
    description = 'Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'  # noqa
    category = 'news, literature, UK'
    publisher = 'LRB Ltd.'
    language = 'en_GB'
    no_stylesheets = True
    delay = 1
    encoding = 'utf-8'
    INDEX = _SITE_ROOT
    publication_type = 'magazine'
    requires_version = (3, 0, 0)

    # Only the article title/header and body containers are kept ...
    keep_only_tags = [
        classes('article-header--title paperArticle-reviewsHeader article-content letters-content'),
    ]
    # ... and these elements are stripped from what remains.
    remove_tags = [
        classes('social-button article-mask'),
    ]
    remove_attributes = ['width', 'height']

    def _cover_from_img(self, img):
        """Return the cover image URL advertised by ``img``, or ``None``.

        Prefers the last candidate of ``data-srcset`` (the token before the
        final descriptor, exactly as ``split()[-2]`` selects) and falls back
        to a lone URL without descriptor, then to the plain ``src``.
        """
        tokens = (img.get('data-srcset') or '').split()
        if len(tokens) >= 2:
            url = tokens[-2]
        elif tokens:
            url = tokens[0]
        else:
            url = img.get('src')
        return absolutize(url, self.INDEX) if url else None

    def parse_index(self):
        """Collect the articles of the current issue.

        Side effects: sets ``self.cover_url`` when a cover image is found and
        ``self.timefmt`` to the issue heading shown on the front page.

        :return: ``[('Articles', [{'title': ..., 'url': ...}, ...])]``.
        :raises ValueError: when the front page or the issue page no longer
            has the expected structure.
        """
        soup = self.index_to_soup(self.INDEX)
        container = soup.find(attrs={'class': 'issue-grid'})
        if container is None:
            raise ValueError(
                'Could not find the current issue block on ' + self.INDEX)
        img = container.find('img')
        if img is None:
            raise ValueError(
                'Could not find the issue cover image on ' + self.INDEX)

        # A missing cover is cosmetic: warn and carry on, do not abort.
        cover = self._cover_from_img(img)
        if cover:
            self.cover_url = cover
        else:
            self.log.warn('No cover image URL found for the current issue')

        heading = container.find('h3')
        if heading is not None:
            self.timefmt = ' [{}]'.format(self.tag_to_string(heading))

        # The cover image is wrapped in the link to the issue's contents page.
        issue_link = img.findParent('a')
        issue_href = issue_link.get('href') if issue_link is not None else None
        if not issue_href:
            raise ValueError(
                'Could not find the link to the current issue on ' + self.INDEX)

        soup = self.index_to_soup(absolutize(issue_href, self.INDEX))
        grid = soup.find(attrs={'class': 'toc-grid-items'})
        if grid is None:
            raise ValueError(
                'Could not find the table of contents of the current issue')

        articles = []
        for item in grid.findAll(**classes('toc-item')):
            href = item.get('href')
            if not href:
                # Not a link, so there is nothing to download for this entry.
                continue
            url = absolutize(href, self.INDEX)
            # The title is built as '<h3>: <h4>'; a missing or empty part is
            # left out so the title never carries a dangling ': '.
            parts = [
                self.tag_to_string(tag)
                for tag in (item.find('h3'), item.find('h4'))
                if tag is not None
            ]
            title = ': '.join(p for p in parts if p) or url
            self.log(title, url)
            articles.append({'title': title, 'url': url})

        return [('Articles', articles)]