mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update The Baffler
This commit is contained in:
parent
0d22b5f3ef
commit
92bef3ec5e
@@ -1,68 +1,85 @@
|
|||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
    """Build a ``find()``-style keyword dict that matches tags by CSS class.

    Args:
        classes: Whitespace-separated class names to look for.

    Returns:
        ``{'attrs': {'class': matcher}}`` where ``matcher(value)`` is truthy
        when the ``class`` attribute string ``value`` shares at least one
        name with ``classes``, and falsy when ``value`` is ``None``, empty,
        or has no name in common.
    """
    # split() with no argument tolerates tabs, newlines and repeated or
    # surrounding spaces; split(' ') would treat 'a\tb' as one class name
    # and add an empty-string entry for every doubled space.
    wanted = frozenset(classes.split())
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(wanted)}
    )
|
|
||||||
|
|
||||||
|
|
||||||
class TheBaffler(BasicNewsRecipe):
    """Recipe for The Baffler: downloads the issue listed first on /issues."""

    title = 'The Baffler'
    __author__ = 'unkn0wn'
    description = ('This magazine contains left-wing criticism, cultural analysis, shorts'
                   ' stories, poems and art. They publish six print issues annually.')
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']

    # NOTE(review): `.lg:text-xs` is read by a CSS parser as class `lg` plus a
    # pseudo-class `text-xs`; if the intent is to style elements whose class
    # is literally `lg:text-xs`, the colon probably needs escaping
    # (`.lg\:text-xs`) -- confirm against the article markup before changing.
    extra_css = '''
        .entry-subtitle{color:#202020; font-style:italic; text-align:left;}
        blockquote{color:gray;}
        em{color:#404040;}
        .wp-caption-text{font-size:small; text-align:center;}
        .lg:text-xs{color:gray; font-size:small; text-align:center;}
        .author-meta{font-size:small; color:gray;}
    '''

    keep_only_tags = [
        dict(name='main', attrs={'id': 'main'})
    ]

    remove_tags = [
        classes('entry-date issue-number-segment single-article-vertical donation-footer'),
        dict(name='footer')
    ]

    def get_cover_url(self):
        """Return the cover image URL, refreshed from the shop page if possible.

        Looks for an ``<img>`` inside a ``<div class="row">`` on the Exact
        Editions shop page and stores its ``src`` in ``self.cover_url``.
        When no such image or ``src`` is found, the previously set
        ``cover_url`` (or ``None`` if there is none) is returned instead.
        """
        soup = self.index_to_soup('https://shop.exacteditions.com/us/the-baffler')
        tag = soup.find('div', attrs={'class': 'row'})
        # Guard each step: the row may lack an <img>, and the <img> may lack
        # a src attribute; neither case should raise.
        img = tag.find('img') if tag else None
        src = img.get('src') if img is not None else None
        if src:
            self.cover_url = src
        return getattr(self, 'cover_url', None)

    def parse_index(self):
        """Build the section/article index for the issue found on /issues.

        Side effects: may update ``self.title``, and updates ``self.timefmt``
        and ``self.description`` from the issue teaser on the issues page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a non-empty list of dicts with ``title``, ``url`` and
            ``description`` keys.
        """
        soup = self.index_to_soup('https://thebaffler.com/issues')
        issue = soup.find('article')

        # The heading is expected to contain an em dash followed by the
        # edition name; fall back to an empty edition (title left unchanged)
        # instead of raising IndexError when the dash is missing.
        heading_parts = self.tag_to_string(issue.find('h3')).strip().split('—')
        edition = heading_parts[1].strip() if len(heading_parts) > 1 else ''
        if edition:
            self.log('Downloading Issue: ', edition)
            self.title = 'The Baffler : ' + edition
        self.timefmt = ' [' + self.tag_to_string(issue.find('div', **classes('font-lion'))).strip() + ']'
        a = issue.find('a')

        # Prepend the text of the issue link to the recipe description.
        self.description = (
            u'\n\n' + self.tag_to_string(a).strip() +
            u'\n\n' + self.description
        )

        soup = self.index_to_soup(a['href'])
        ans = []
        main = soup.find('main', attrs={'id': 'main'})
        for section in main.findAll('section'):
            current_section = self.tag_to_string(section.h1).strip()
            self.log(current_section)
            articles = []
            for h3 in section.findAll('h3'):
                link = h3.a
                # A heading without a link cannot be downloaded; skip it
                # rather than failing on `None['href']`.
                if link is None or not link.get('href'):
                    continue
                title = self.tag_to_string(h3)
                url = link['href']
                desc = ''
                span = h3.findNext('span')
                if span:
                    desc = self.tag_to_string(span).strip()
                    # When a second <span> follows, its text is placed in
                    # front of the first one's, separated by ' | '.
                    span2 = span.findNext('span')
                    if span2:
                        desc = self.tag_to_string(span2).strip() + ' | ' + desc
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                articles.append(
                    {'title': title, 'url': url, 'description': desc})
            if articles:
                ans.append((current_section, articles))

        return ans

    def preprocess_html(self, soup):
        """Turn the title block and section subheads into heading tags.

        The first ``<div>`` matching the ``entry-title`` class is renamed to
        ``<h1>`` and every ``<p class="parasectionhed">`` to ``<h4>``. The
        soup is modified in place and returned.
        """
        div = soup.find('div', **classes('entry-title'))
        if div:
            div.name = 'h1'
        for p in soup.findAll('p', attrs={'class': 'parasectionhed'}):
            p.name = 'h4'
        return soup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user