commit d9561e2321
Author: Kovid Goyal
Date:   2024-07-24 11:47:19 +05:30
7 changed files with 88 additions and 28 deletions
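
Most of the changes below wire calibre's recipe_specific_options mechanism into individual recipes: a class-level dict declares the options a recipe accepts, and the recipes read values back with self.recipe_specific_options.get(...), guarding with isinstance(d, str) so the code behaves whether the attribute holds a user-supplied string or still the declaration dict. A minimal sketch of the pattern (the recipe name and URLs here are hypothetical, not from the commit):

from calibre.web.feeds.news import BasicNewsRecipe


class Example(BasicNewsRecipe):
    title = 'Example Magazine'

    # Declares the option; at run time the attribute is expected to hold
    # the user-supplied values instead of these declaration dicts.
    recipe_specific_options = {
        'date': {
            'short': 'Edition date (YYYYMMDD)',
            'long': 'For example, 20240722.',
        }
    }

    def parse_index(self):
        # Hypothetical archive URL; defaults to the current year.
        issue_url = 'https://example.com/magazine?archives=2024'
        d = self.recipe_specific_options.get('date')
        # The isinstance guard keeps the default when no string was supplied.
        if d and isinstance(d, str):
            issue_url = 'https://example.com/magazine?archives=' + d
        return []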

@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 import re
 from collections import defaultdict
 from datetime import date
@@ -77,12 +79,23 @@ class barrons(BasicNewsRecipe):
         ]
         return br

+    recipe_specific_options = {
+        'date': {
+            'short': 'The date of the edition to download (YYYYMMDD format)',
+            'long': 'For example, 20240722.\nIf it didn\'t work, try again later.'
+        }
+    }
+
     def parse_index(self):
         self.log(
             '\n***\nif this recipe fails, report it on: '
             'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
         )
-        archive = self.index_to_soup('https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y'))
+        issue_url = 'https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y')
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            issue_url = 'https://www.barrons.com/magazine?archives=' + d
+        archive = self.index_to_soup(issue_url)
         issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--'))
         self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']'
         self.description = self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--headline--')))
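
parse_index splices the user's date string straight into the archive URL. A hedged sketch of pre-validating the YYYYMMDD value first, shown only as an alternative and not as part of the commit (the helper name is hypothetical; strptime raises ValueError on malformed input):

from datetime import datetime


def validate_edition_date(d):
    # Raises ValueError unless d is a real calendar date in YYYYMMDD form.
    datetime.strptime(d, '%Y%m%d')
    return d


print(validate_edition_date('20240722'))  # 20240722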

@@ -51,14 +51,14 @@ class BT(BasicNewsRecipe):
             '\n***\nif this recipe fails, report it on: '
             'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
         )
-        soup = self.index_to_soup('https://www.businesstoday.in')
-        a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1]
-        url = a['href']
         d = self.recipe_specific_options.get('date')
         if d and isinstance(d, str):
             url = 'https://www.businesstoday.in/magazine/issue/' + d
         else:
+            soup = self.index_to_soup('https://www.businesstoday.in')
+            a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1]
+            url = a['href']
             self.cover_url = a.img['data-src'].split('?')[0]
         self.log('issue =', url)

@@ -55,7 +55,10 @@ class ht(BasicNewsRecipe):
         if p and isinstance(p, str):
             today = p
-        self.timefmt = ' [%s]' % today
+        day, month, year = (int(x) for x in today.split('/'))
+        dt = date(year, month, day)
+        self.timefmt = ' [%s]' % dt.strftime('%b %d, %Y')
         today = today.replace('/', '%2F')
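
The rewritten block assumes the date option arrives as DD/MM/YYYY and reformats it for the edition label; a standalone check of that logic, with a sample value:

from datetime import date

today = '22/07/2024'  # sample value in the recipe's expected DD/MM/YYYY form
day, month, year = (int(x) for x in today.split('/'))
dt = date(year, month, day)
print(' [%s]' % dt.strftime('%b %d, %Y'))  # [Jul 22, 2024]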

@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
''' '''
https://www.cirsd.org/en/horizons https://www.cirsd.org/en/horizons
''' '''
@@ -29,22 +31,33 @@ class horizons(BasicNewsRecipe):
         dict(name='div', attrs={'class':'single-post-footer'})
     ]

+    recipe_specific_options = {
+        'issue_url': {
+            'short': 'The issue URL ',
+            'long': 'For example, https://www.cirsd.org/en/horizons/horizons-winter-2024--issue-no-25',
+        }
+    }
+
     def get_browser(self):
         return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)

     def parse_index(self):
-        soup = self.index_to_soup('https://www.cirsd.org/en/horizons')
-        a = soup.findAll('a', href=True, attrs={'class':'horizon-gallery-box'})[0]  # use 1 for previous edition
-        url = a['href']
-        if url.startswith('/'):
-            url = 'https://www.cirsd.org' + url
-        self.cover_url = a.find('img')['src']
-        self.log(self.cover_url)
-        issue = a.find('div', attrs={'class':'horizon-gallery-title'})
-        if issue:
-            self.title = self.tag_to_string(issue).strip()
-            self.timefmt = ' [' + self.tag_to_string(issue).strip().replace('Horizons ', '') + ']'
-            self.log('Downloading Issue: ', self.timefmt, self.title)
+        d = self.recipe_specific_options.get('issue_url')
+        if d and isinstance(d, str):
+            url = d
+        else:
+            soup = self.index_to_soup('https://www.cirsd.org/en/horizons')
+            a = soup.findAll('a', href=True, attrs={'class':'horizon-gallery-box'})[0]  # use 1 for previous edition
+            url = a['href']
+            if url.startswith('/'):
+                url = 'https://www.cirsd.org' + url
+            self.cover_url = a.find('img')['src']
+            self.log(self.cover_url)
+            issue = a.find('div', attrs={'class':'horizon-gallery-title'})
+            if issue:
+                self.title = self.tag_to_string(issue).strip()
+                self.timefmt = ' [' + self.tag_to_string(issue).strip().replace('Horizons ', '') + ']'
+                self.log('Downloading Issue: ', self.timefmt, self.title)

         soup = self.index_to_soup(url)
         feeds = []
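
The startswith('/') check absolutizes root-relative links by hand; urllib's urljoin handles the same case (plus scheme-relative and already-absolute links) in one call. Shown here only as a sketch of the alternative, not as a change to the recipe:

from urllib.parse import urljoin

base = 'https://www.cirsd.org/en/horizons'
print(urljoin(base, '/en/horizons/horizons-winter-2024--issue-no-25'))
# https://www.cirsd.org/en/horizons/horizons-winter-2024--issue-no-25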

@@ -97,9 +97,8 @@ class LiveMint(BasicNewsRecipe):
         .summary, .highlights, .synopsis {
             font-weight:normal !important; font-style:italic; color:#202020;
         }
-        h2 {font-size:normal !important;}
         em, blockquote {color:#202020;}
-        .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag {font-size:small;}
+        .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
     '''

     keep_only_tags = [
keep_only_tags = [ keep_only_tags = [
@@ -109,12 +108,15 @@ class LiveMint(BasicNewsRecipe):
     ]

     remove_tags = [
         dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
+        dict(attrs={'class':lambda x: x and x.startswith(
+            ('storyPage_alsoRead__', 'storyPage_firstPublishDate__', 'storyPage_bcrumb__')
+        )}),
+        dict(attrs={'id':['faqSection', 'seoText', 'ellipsisId']}),
         classes(
-            'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider'
+            'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
             ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
-            ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
-        ),
-        dict(attrs={'class':lambda x: x and x.startswith('storyPage_alsoRead__')})
+            ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn trade'
+        )
     ]

     feeds = [
feeds = [ feeds = [
@@ -160,22 +162,36 @@ class LiveMint(BasicNewsRecipe):
         return raw

     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        auth = soup.find(attrs={'class':lambda x: x and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))})
+        if auth:
+            auth['class'] = 'auth'
+        summ = soup.find(attrs={'class':lambda x: x and x.startswith('storyPage_summary__')})
+        if summ:
+            summ['class'] = 'summary'
         for strong in soup.findAll('strong'):
             if strong.find('p'):
                 strong.name = 'div'
         for embed in soup.findAll('div', attrs={'class':'embed'}):
-            if nos := embed.find('noscript'):
+            nos = embed.find('noscript')
+            if nos:
                 nos.name = 'span'
         for span in soup.findAll('figcaption'):
             span['id'] = 'img-cap'
         for auth in soup.findAll('span', attrs={'class':lambda x: x and 'articleInfo' in x.split()}):
             auth.name = 'div'
-        for span in soup.findAll('span', attrs={'class':'exclusive'}):
-            span.extract()
         for img in soup.findAll('img', attrs={'data-src': True}):
             img['src'] = img['data-src']
+        for span in soup.findAll('span', attrs={'class':'exclusive'}):
+            span.extract()
+        for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
+            pa = al.findParent('p')
+            if pa:
+                pa.extract()
         if wa := soup.find(**classes('autobacklink-topic')):
-            if p := wa.findParent('p'):
+            p = wa.findParent('p')
+            if p:
                 p.extract()
         return soup
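
Several of the new selectors match CSS-module classes by prefix (storyPage_summary__, storyPage_authorInfo__, ...) because the hashed suffix changes between site builds. A self-contained BeautifulSoup illustration of that retagging step; the HTML snippet is made up:

from bs4 import BeautifulSoup

html = '<div class="storyPage_summary__x9Yz">Deck text</div>'
soup = BeautifulSoup(html, 'html.parser')
summ = soup.find(attrs={'class': lambda x: x and x.startswith('storyPage_summary__')})
if summ:
    summ['class'] = 'summary'  # retag so the recipe's .summary CSS applies
print(soup)  # <div class="summary">Deck text</div>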

@@ -67,7 +67,18 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
                 img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + img['data-appsrc'].split('/images/')[-1]
         return soup

+    recipe_specific_options = {
+        'issue_url': {
+            'short': 'The issue URL ',
+            'long': 'For example, https://www.lrb.co.uk/the-paper/v46/n01',
+            'default': 'https://www.lrb.co.uk/the-paper/'
+        }
+    }
+
     def parse_index(self):
+        d = self.recipe_specific_options.get('issue_url')
+        if d and isinstance(d, str):
+            self.INDEX = d
         soup = self.index_to_soup(self.INDEX)
         container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
         if container:
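
Because this option declares a 'default' matching the class's INDEX, the recipe behaves the same whether the user supplies nothing or the default URL. A minimal stub of that override pattern (the class below is hypothetical, not the recipe itself):

class LRBStub:
    INDEX = 'https://www.lrb.co.uk/the-paper/'  # latest issue

    def __init__(self, recipe_specific_options=None):
        # stands in for the values injected at run time
        self.recipe_specific_options = recipe_specific_options or {}

    def resolve_index(self):
        d = self.recipe_specific_options.get('issue_url')
        if d and isinstance(d, str):
            self.INDEX = d  # e.g. https://www.lrb.co.uk/the-paper/v46/n01
        return self.INDEX


print(LRBStub().resolve_index())
print(LRBStub({'issue_url': 'https://www.lrb.co.uk/the-paper/v46/n01'}).resolve_index())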

@@ -168,7 +168,11 @@ class WSJ(BasicNewsRecipe):
             sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
             data = sec_parse['articles']
             for art in data:
-                dt = datetime.fromtimestamp(data[art]['pubdateNumber'] + time.timezone)
+                try:
+                    tme = data[art]['pubdateNumber']
+                except Exception:
+                    tme = data[art]['origPubdateNumber']
+                dt = datetime.fromtimestamp(tme + time.timezone)
                 if (datetime.now() - dt) > timedelta(self.oldest_article):
                     continue
                 title = data[art]['headline']
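
The try/except falls back to origPubdateNumber for articles that lack pubdateNumber, and the surrounding check then drops anything older than oldest_article days. The same flow in isolation, using a dict.get chain instead of exception handling and a fabricated article record:

import time
from datetime import datetime, timedelta

oldest_article = 2  # days, mirroring the recipe attribute
art = {'origPubdateNumber': int(time.time()) - 3 * 86400}  # fabricated: three days old

tme = art.get('pubdateNumber', art.get('origPubdateNumber'))
dt = datetime.fromtimestamp(tme + time.timezone)
print('skip' if datetime.now() - dt > timedelta(oldest_article) else 'keep')  # skip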