This commit is contained in:
Kovid Goyal 2024-02-01 19:13:12 +05:30
commit c48d0700a4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 25 additions and 9 deletions

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
from collections import defaultdict
from datetime import date from datetime import date
import re import re
@ -24,17 +25,17 @@ class barrons(BasicNewsRecipe):
img {display:block; margin:0 auto;} img {display:block; margin:0 auto;}
.figc { font-size:small; text-align:center; } .figc { font-size:small; text-align:center; }
.imageCredit { color:#404040; font-size:x-small; } .imageCredit { color:#404040; font-size:x-small; }
.headline__category { font-size:small; color:#404040; } .headline__category, .article-prebody { font-size:small; color:#404040; }
.sub-head { color:#202020; } .sub-head { color:#202020; }
''' '''
keep_only_tags = [ keep_only_tags = [
classes('headline articleLead'), classes('headline articleLead article-prebody'),
dict(name='section', attrs={'subscriptions-section':'content'}) dict(name='section', attrs={'subscriptions-section':'content'})
] ]
remove_tags = [ remove_tags = [
dict(name=['meta', 'link', 'svg', 'button', 'i-amphtml-sizer']), dict(name=['meta', 'link', 'svg', 'button', 'i-amphtml-sizer']),
classes('wsj-ad dynamic-inset-overflow') classes('wsj-ad dynamic-inset-overflow newsletter-inset')
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
@ -42,6 +43,9 @@ class barrons(BasicNewsRecipe):
figc['class'] = 'figc' figc['class'] = 'figc'
for p in figc.findAll('p'): for p in figc.findAll('p'):
p.name = 'div' p.name = 'div'
for by in soup.findAll(**classes('byline')):
for p in by.findAll('p'):
p.name = 'span'
for h2 in soup.findAll('h2'): for h2 in soup.findAll('h2'):
h2.name = 'h4' h2.name = 'h4'
for iframe in soup.findAll('amp-iframe'): for iframe in soup.findAll('amp-iframe'):
@ -54,7 +58,11 @@ class barrons(BasicNewsRecipe):
iframe['src'] = 'https://datawrapper.dwcdn.net/' + data.group(1) + '/full.png' iframe['src'] = 'https://datawrapper.dwcdn.net/' + data.group(1) + '/full.png'
for amp in soup.findAll('amp-img'): for amp in soup.findAll('amp-img'):
if not amp.find('img', attrs={'src':True}): if not amp.find('img', attrs={'src':True}):
amp.name = 'img' if amp.has_attr('src'):
amp['src'] = amp['src'] + '&pixel_ratio=1.5'
amp.name = 'img'
else:
amp.img['src'] = amp.img['src'] + '&pixel_ratio=1.5'
return soup return soup
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
@ -73,9 +81,15 @@ class barrons(BasicNewsRecipe):
self.log(self.timefmt) self.log(self.timefmt)
self.cover_url = issue.img['src'].split('?')[0] self.cover_url = issue.img['src'].split('?')[0]
ans = [] ans = defaultdict(list)
for articles in archive.findAll(**prefixed_classes('BarronsTheme--story--')): for articles in archive.findAll(**prefixed_classes('BarronsTheme--story--')):
section = 'Magazine'
strap = articles.find_previous_sibling(**prefixed_classes('BarronsTheme--strap--'))
if strap:
label = strap.find(**prefixed_classes('BarronsTheme--label--'))
if label:
section = self.tag_to_string(label).strip()
a = articles.find(**prefixed_classes('BarronsTheme--heading')) a = articles.find(**prefixed_classes('BarronsTheme--heading'))
title = self.tag_to_string(a).strip() title = self.tag_to_string(a).strip()
url = a.a['href'] url = a.a['href']
@ -90,8 +104,8 @@ class barrons(BasicNewsRecipe):
if summ: if summ:
desc += ' | ' + self.tag_to_string(summ) desc += ' | ' + self.tag_to_string(summ)
self.log('\t', title, ' ', url, '\n\t', desc) self.log('\t', title, ' ', url, '\n\t', desc)
ans.append({'title': title, 'url': url, 'description': desc}) ans[section].append({'title': title, 'url': url, 'description': desc})
return [('Articles', ans)] return [(section, articles) for section, articles in ans.items()]
def print_version(self, url): def print_version(self, url):
return url.split('?')[0].replace('/articles/', '/amp/articles/') return url.split('?')[0].replace('/articles/', '/amp/articles/')

View File

@ -102,7 +102,7 @@ class LiveMint(BasicNewsRecipe):
dict(name=['meta', 'link', 'svg', 'button', 'iframe']), dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
classes( classes(
'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight' 'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight'
' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot' ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo'
' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText' ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText'
) )
] ]

View File

@ -154,7 +154,9 @@ class NatGeo(BasicNewsRecipe):
if photoart := soup.find(attrs={'class':lambda x: x and 'BgImagePromo__Container__Text__Link' in x.split()}): if photoart := soup.find(attrs={'class':lambda x: x and 'BgImagePromo__Container__Text__Link' in x.split()}):
ans2 = [] ans2 = []
title = self.tag_to_string(photoart) title = self.tag_to_string(photoart)
url = 'https://www.nationalgeographic.com' + photoart['href'] url = photoart['href']
if url.startswith('/'):
url = 'https://www.nationalgeographic.com' + photoart['href']
ans2.append(('Photo Essay', [{'title': title, 'url': url}])) ans2.append(('Photo Essay', [{'title': title, 'url': url}]))
for gird in soup.findAll(attrs={'class':'GridPromoTile'}): for gird in soup.findAll(attrs={'class':'GridPromoTile'}):
for article in soup.findAll('article'): for article in soup.findAll('article'):