from calibre.web.feeds.news import BasicNewsRecipe, classes


class TheBaffler(BasicNewsRecipe):
    '''
    Calibre news recipe for The Baffler (https://thebaffler.com), a
    left-wing magazine publishing six print issues annually.  Downloads
    the latest issue listed on the /issues page.
    '''
    title = 'The Baffler'
    __author__ = 'unkn0wn'
    description = ('This magazine contains left-wing criticism, cultural analysis, short'
                   ' stories, poems and art. They publish six print issues annually.')
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']

    # NOTE: 'text-align' (was the invalid 'text:align'); the colon in the
    # Tailwind-style class name 'lg:text-xs' must be escaped as '\:' or the
    # selector is parsed as a bogus pseudo-class and the rule is dropped.
    extra_css = '''
        .entry-subtitle{color:#202020; font-style:italic; text-align:left;}
        blockquote{color:gray;}
        em{color:#404040;}
        .wp-caption-text{font-size:small; text-align:center;}
        .lg\\:text-xs{color:gray; font-size:small; text-align:center;}
        .author-meta{font-size:small; color:gray;}
    '''

    keep_only_tags = [
        dict(name='main', attrs={'id': 'main'})
    ]

    remove_tags = [
        classes('entry-date issue-number-segment single-article-vertical donation-footer'),
        dict(name='footer')
    ]

    def get_cover_url(self):
        # The magazine's own site has no stable cover image, so scrape the
        # cover thumbnail from the Exact Editions shop listing instead.
        soup = self.index_to_soup('https://shop.exacteditions.com/us/the-baffler')
        tag = soup.find('div', attrs={'class': 'row'})
        if tag:
            self.cover_url = tag.find('img')['src']
        # getattr(self, 'cover_url', self.cover_url) was a no-op: the default
        # argument is evaluated eagerly, so just return the attribute
        # (BasicNewsRecipe provides cover_url = None as a class default).
        return self.cover_url

    def parse_index(self):
        soup = self.index_to_soup('https://thebaffler.com/issues')
        # The first <article> on the issues page is the newest issue.
        issue = soup.find('article')
        # Heading looks like 'no. 64 — Title'; the text after the em-dash is
        # the issue name.  Guard against a heading without an em-dash, which
        # previously raised IndexError.
        parts = self.tag_to_string(issue.find('h3')).strip().split('—')
        edition = parts[1] if len(parts) > 1 else ''
        if edition:
            self.log('Downloading Issue: ', edition)
            self.title = 'The Baffler : ' + edition
        self.timefmt = ' [' + self.tag_to_string(issue.find('div', **classes('font-lion'))).strip() + ']'
        a = issue.find('a')
        # Prepend the issue blurb to the static recipe description.
        self.description = (
            u'\n\n' + self.tag_to_string(a).strip() +
            u'\n\n' + self.description
        )
        soup = self.index_to_soup(a['href'])
        ans = []
        main = soup.find('main', attrs={'id': 'main'})
        for section in main.findAll('section'):
            current_section = self.tag_to_string(section.h1).strip()
            self.log(current_section)
            articles = []
            for h3 in section.findAll('h3'):
                title = self.tag_to_string(h3)
                url = h3.a['href']
                desc = ''
                # First following <span> is the deck; the one after it (when
                # present) is the author byline, shown as 'author | deck'.
                span = h3.findNext('span')
                if span:
                    desc = self.tag_to_string(span).strip()
                    span2 = span.findNext('span')
                    if span2:
                        desc = self.tag_to_string(span2).strip() + ' | ' + desc
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                articles.append(
                    {'title': title, 'url': url, 'description': desc})
            if articles:
                ans.append((current_section, articles))
        return ans

    def preprocess_html(self, soup):
        # Promote the article title div to a real <h1> and the in-body
        # section heads to <h4> so the ebook conversion styles them.
        div = soup.find('div', **classes('entry-title'))
        if div:
            div.name = 'h1'
        for p in soup.findAll('p', attrs={'class': 'parasectionhed'}):
            p.name = 'h4'
        return soup