__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic'
'''
lrb.co.uk
'''

import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class LondonReviewOfBooksPayed(BasicNewsRecipe):
    title = 'London Review of Books'
    __author__ = 'Rich Shang, Darko Miletic'
    description = 'Subscription content. Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'  # noqa
    category = 'news, literature, UK'
    publisher = 'LRB Ltd.'
    max_articles_per_feed = 100
    language = 'en_GB'
    no_stylesheets = True
    delay = 1
    use_embedded_content = False
    encoding = 'utf-8'
    INDEX = 'https://www.lrb.co.uk'
    LOGIN = INDEX + '/login'
    masthead_url = INDEX + '/assets/images/lrb_logo_big.gif'
    needs_subscription = True
    publication_type = 'magazine'
    requires_version = (3, 0, 0)
    extra_css = ' body{font-family: Georgia,Palatino,"Palatino Linotype",serif} '

    def get_browser(self):
        # Log in to the subscriber area before any article is fetched
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(action='/login')
            br['username'] = self.username
            br['password'] = self.password
            # compare against a bytes literal so the check works whether read() returns bytes or str
            raw = br.submit().read()
            if b'You are logged in as' not in raw:
                raise ValueError('Failed to log in, check username and password')
        return br

    def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        cover_item = soup.find('p', attrs={'class': 'cover'})
        # the issue date follows a line break inside the coverdate span:
        # split on the <br> tag (matched loosely) and strip any remaining markup
        dates = str(soup.find('span', attrs={'class': 'coverdate'}))
        newdates = re.sub(r'\<.*\>', '', re.split('<br ?/?>', dates)[1])
        self.timefmt = ' [%s]' % newdates
        lrbtitle = self.title
        if cover_item:
            # use the large cover image and follow the cover link to the issue's table of contents
            self.cover_url = re.sub('/m/', '/l/', cover_item.a.img['src'])
            content = self.INDEX + cover_item.a['href']
            soup2 = self.index_to_soup(content)
            sitem = soup2.find(attrs={'class': 'article-list'})
            lrbtitle = soup2.head.title.string
            for item in sitem.findAll('a', attrs={'class': 'title'}):
                description = u''
                title_prefix = u''
                feed_link = item
                if feed_link.has_key('href'):  # noqa
                    url = self.INDEX + feed_link['href']
                    # author and article title are separated by a <br> inside the link
                    title_link = re.split('<br ?/?>', str(feed_link))
                    if len(title_link) > 1:
                        title = (title_prefix + re.sub(r'\<.*\>', '', title_link[0]) +
                                 ' - ' + re.sub(r'\<.*\>', '', title_link[1]))
                    else:
                        title = title_prefix + self.tag_to_string(feed_link)
                    desc = item.findNext('li')
                    if desc is not None and desc.find('cite') is not None and desc.find('ul') is None:
                        description = self.tag_to_string(desc)
                    date = strftime(self.timefmt)
                    articles.append({
                        'title': title,
                        'date': date,
                        'url': url,
                        'description': description
                    })
        return [(lrbtitle, articles)]

    conversion_options = {
        'comments': description, 'tags': category, 'language': language, 'publisher': publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': ['article-body indent', 'letters']})]
    remove_attributes = ['width', 'height']