commit 838a056ad2
Author: Kovid Goyal
Date:   2024-05-17 12:06:44 +05:30
GPG key ID: 06BC317B515ACE7C (no known key found for this signature in the database)
2 changed files with 42 additions and 31 deletions

File 1 of 2: London Review of Books recipe (class LondonReviewOfBooksPayed)

@@ -26,18 +26,24 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
     no_stylesheets = True
     delay = 1
     encoding = 'utf-8'
-    INDEX = 'https://www.lrb.co.uk'
+    INDEX = 'https://www.lrb.co.uk/the-paper/'
     publication_type = 'magazine'
-    needs_subscription = True
+    needs_subscription = 'optional'
     requires_version = (3, 0, 0)
+    masthead_url = 'https://www.mylrb.co.uk/out/lrb-2014/img/logo-2x.png'
+    extra_css = '''
+        .article-word-count, #article-tag-holder { font-size:small; color:#202020; }
+        .embedded-image-caption { font-size:small; text-align:center; }
+        blockquote, em { color:#202020; }
+    '''
+    resolve_internal_links = True
     keep_only_tags = [
-        classes('article-header--title paperArticle-reviewsHeader article-content letters-content'),
+        dict(attrs={'id':['article-tag-holder', 'article-heading-holder']}),
+        classes('article-copy article-word-count'),
     ]
-    remove_tags = [
-        classes('social-button article-mask lrb-readmorelink article-send-letter article-share'),
-    ]
-    remove_attributes = ['width', 'height']
+    remove_attributes = ['style', 'width', 'height']

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
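
Note: the keep_only_tags change swaps calibre's classes() helper for a plain BeautifulSoup attribute filter. A list value for an attribute matches an element whose id equals any entry, so one dict keeps both holder divs. A standalone sketch (not part of the recipe):

    from bs4 import BeautifulSoup

    html = ('<div id="article-tag-holder">tags</div>'
            '<div id="article-heading-holder">heading</div>'
            '<div id="sidebar">dropped</div>')
    soup = BeautifulSoup(html, 'html.parser')
    # a list-valued attribute filter matches any of the listed ids
    kept = soup.find_all(attrs={'id': ['article-tag-holder', 'article-heading-holder']})
    print([d['id'] for d in kept])  # ['article-tag-holder', 'article-heading-holder']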
@@ -52,22 +58,23 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
         return br

     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        for cap in soup.findAll(**classes('embedded-image-caption')):
+            for p in cap.findAll('p'):
+                p.name = 'div'
         for img in soup.findAll('img', attrs={'data-srcset': True}):
-            for x in img['data-srcset'].split():
-                if '/' in x:
-                    img['src'] = x
+            img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + img['data-appsrc'].split('/images/')[-1]
         return soup

     def parse_index(self):
+        articles = []
         soup = self.index_to_soup(self.INDEX)
-        container = soup.find(attrs={'class': 'issue-grid'})
-        img = container.find('img')
-        self.cover_url = img['data-srcset'].split()[-2]
-        h3 = container.find('h3')
-        self.timefmt = ' [{}]'.format(self.tag_to_string(h3))
-        a = img.findParent('a')
-        soup = self.index_to_soup(absolutize(a['href']))
+        container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
+        if container:
+            self.cover_url = 'https://www.lrb.co.uk/storage/800_filter/images/' + container.img['data-appsrc'].split('/images/')[-1]
+        edition = self.tag_to_string(soup.find('h1', attrs={'class': 'toc-title'}))
+        self.timefmt = ' [{}]'.format(edition)
+        self.log('Downloading: ', edition)
         grid = soup.find(attrs={'class': 'toc-grid-items'})
         articles = []
         for a in grid.findAll(**classes('toc-item')):
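
Note: preprocess_html no longer scans data-srcset for a usable URL; it rebuilds the image src from data-appsrc, and parse_index now does the same for the cover. A sketch with a hypothetical data-appsrc value (the 400_filter/800_filter path segments appear to select fixed-width renditions on LRB's image server; that is an inference from the recipe, not a documented API):

    data_appsrc = '/storage/images/2024/may/cover-photo.jpg'  # hypothetical value
    article_src = 'https://www.lrb.co.uk/storage/400_filter/images/' + data_appsrc.split('/images/')[-1]
    cover_src = 'https://www.lrb.co.uk/storage/800_filter/images/' + data_appsrc.split('/images/')[-1]
    print(article_src)  # https://www.lrb.co.uk/storage/400_filter/images/2024/may/cover-photo.jpg
    print(cover_src)    # https://www.lrb.co.uk/storage/800_filter/images/2024/may/cover-photo.jpg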
@@ -77,5 +84,4 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
             title = '{}: {}'.format(self.tag_to_string(h3), self.tag_to_string(h4))
             self.log(title, url)
             articles.append({'title': title, 'url': url})
         return [('Articles', articles)]
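
Note: for context, calibre expects parse_index to return a list of (section, articles) tuples, where each article is a dict with at least 'title' and 'url' keys. A minimal shape sketch with made-up values:

    def parse_index_shape():
        articles = [
            {'title': 'Reviewer: Some Book', 'url': 'https://www.lrb.co.uk/the-paper/v00/n00/example'},
        ]
        return [('Articles', articles)]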

File 2 of 2: Wall Street Journal recipe (class WSJ)

@@ -85,6 +85,11 @@ class WSJ(BasicNewsRecipe):
             div = col.findParent('div')
             if div:
                 div.extract()
+        time = soup.find('time')
+        if time:
+            p = time.findParent('div')
+            if p:
+                p.name = 'p'
         return soup

     if not past_edition:
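
Note: the added block demotes the <div> wrapping the article's <time> element to a <p>. Assigning to Tag.name in BeautifulSoup renames the element in place without touching its children; a standalone sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div><time>May 17, 2024</time></div>', 'html.parser')
    t = soup.find('time')
    if t:
        parent = t.findParent('div')
        if parent:
            parent.name = 'p'  # rename the wrapper, children are preserved
    print(soup)  # <p><time>May 17, 2024</time></p>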
@@ -117,30 +122,26 @@ class WSJ(BasicNewsRecipe):
     def parse_index(self):
         index = 'https://bartender.mobile.dowjones.io'
         catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = []
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                edit.append(itm['key'][3:])
+        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
         self.log('**Past Editions available :', ', '.join(edit))
         for itm in catalog['items']:
             if past_edition:
                 if itm['key'] == 'ITP' + past_edition:
                     key = itm['key']
                     manifest = itm['manifest']
-                    dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                    dt = dt.strftime('%b %d, %Y')
-                    self.log('Downloading Past Edition ', dt)
-                    self.timefmt = ' [' + dt + ']'
+                    date = itm['date']
                     break
             elif itm['type'] == 'ITP':
                 key = itm['key']
                 manifest = itm['manifest']
-                dt = datetime.fromisoformat(itm['date'][:-1]) + timedelta(seconds=time.timezone)
-                dt = dt.strftime('%b %d, %Y')
-                self.log('Downloading ', dt)
-                self.timefmt = ' [' + dt + ']'
                 break
+        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
+        dt = dt.strftime('%b %d, %Y')
+        self.log('Downloading ', dt)
+        self.timefmt = ' [' + dt + ']'

         feeds = []
         manif = json.loads(self.index_to_soup(index + manifest, raw=True))
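
Note: two refactors in this hunk. The past-editions listing collapses into a comprehension, with [1:] dropping the first (current) issue, and the duplicated date formatting is hoisted out of both branches. date[:-1] strips the trailing 'Z', which datetime.fromisoformat() rejects before Python 3.11. A sketch against a hypothetical catalog payload:

    import time
    from datetime import datetime, timedelta

    catalog = {'items': [  # hypothetical payload
        {'type': 'ITP', 'key': 'ITP20240517', 'date': '2024-05-17T09:00:00Z'},
        {'type': 'ITP', 'key': 'ITP20240516', 'date': '2024-05-16T09:00:00Z'},
    ]}
    # keys look like 'ITP<edition>'; [3:] strips the prefix, [1:] drops the current issue
    edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
    print(edit)  # ['20240516']

    date = catalog['items'][0]['date']
    dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
    print(dt.strftime('%b %d, %Y'))  # e.g. May 17, 2024, shifted by the local UTC offset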
@@ -179,6 +180,10 @@ class WSJ(BasicNewsRecipe):
         h1 = soup.find('h1')
         if h1:
             h1['title'] = url['content']
+        h2 = soup.find('h2')
+        if h2:
+            h2['id'] = 'subhed'
+            h2.name = 'p'
         return soup.prettify()

     def populate_article_metadata(self, article, soup, first):
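
Note: the new h2 handling demotes the dek to a paragraph while tagging it with id="subhed", presumably so downstream styling or metadata code can still locate it (the intent is inferred, not stated in the commit). A standalone sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<h1>Headline</h1><h2>The dek line</h2>', 'html.parser')
    h2 = soup.find('h2')
    if h2:
        h2['id'] = 'subhed'  # keep the dek findable by id after the rename
        h2.name = 'p'
    print(soup)  # <h1>Headline</h1><p id="subhed">The dek line</p>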