diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 522fae16b1..923e66d6e5 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes +from collections import defaultdict from datetime import date import re @@ -24,17 +25,17 @@ class barrons(BasicNewsRecipe): img {display:block; margin:0 auto;} .figc { font-size:small; text-align:center; } .imageCredit { color:#404040; font-size:x-small; } - .headline__category { font-size:small; color:#404040; } + .headline__category, .article-prebody { font-size:small; color:#404040; } .sub-head { color:#202020; } ''' keep_only_tags = [ - classes('headline articleLead'), + classes('headline articleLead article-prebody'), dict(name='section', attrs={'subscriptions-section':'content'}) ] remove_tags = [ dict(name=['meta', 'link', 'svg', 'button', 'i-amphtml-sizer']), - classes('wsj-ad dynamic-inset-overflow') + classes('wsj-ad dynamic-inset-overflow newsletter-inset') ] def preprocess_html(self, soup): @@ -42,6 +43,9 @@ class barrons(BasicNewsRecipe): figc['class'] = 'figc' for p in figc.findAll('p'): p.name = 'div' + for by in soup.findAll(**classes('byline')): + for p in by.findAll('p'): + p.name = 'span' for h2 in soup.findAll('h2'): h2.name = 'h4' for iframe in soup.findAll('amp-iframe'): @@ -54,7 +58,11 @@ class barrons(BasicNewsRecipe): iframe['src'] = 'https://datawrapper.dwcdn.net/' + data.group(1) + '/full.png' for amp in soup.findAll('amp-img'): if not amp.find('img', attrs={'src':True}): - amp.name = 'img' + if amp.has_attr('src'): + amp['src'] = amp['src'] + '&pixel_ratio=1.5' + amp.name = 'img' + else: + amp.img['src'] = amp.img['src'] + '&pixel_ratio=1.5' return soup def get_browser(self, *args, **kwargs): @@ -73,9 +81,15 @@ class barrons(BasicNewsRecipe): self.log(self.timefmt) self.cover_url = issue.img['src'].split('?')[0] - ans = [] + ans = defaultdict(list) for articles in archive.findAll(**prefixed_classes('BarronsTheme--story--')): + section = 'Magazine' + strap = articles.find_previous_sibling(**prefixed_classes('BarronsTheme--strap--')) + if strap: + label = strap.find(**prefixed_classes('BarronsTheme--label--')) + if label: + section = self.tag_to_string(label).strip() a = articles.find(**prefixed_classes('BarronsTheme--heading')) title = self.tag_to_string(a).strip() url = a.a['href'] @@ -90,8 +104,8 @@ class barrons(BasicNewsRecipe): if summ: desc += ' | ' + self.tag_to_string(summ) self.log('\t', title, ' ', url, '\n\t', desc) - ans.append({'title': title, 'url': url, 'description': desc}) - return [('Articles', ans)] + ans[section].append({'title': title, 'url': url, 'description': desc}) + return [(section, articles) for section, articles in ans.items()] def print_version(self, url): return url.split('?')[0].replace('/articles/', '/amp/articles/') diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe index ab82b0bc45..7147c72c78 100644 --- a/recipes/livemint.recipe +++ b/recipes/livemint.recipe @@ -102,7 +102,7 @@ class LiveMint(BasicNewsRecipe): dict(name=['meta', 'link', 'svg', 'button', 'iframe']), classes( 'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight' - ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot' + ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo' ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText' ) ] diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe index 4140a45777..fa866a2ec9 100644 --- a/recipes/natgeomag.recipe +++ b/recipes/natgeomag.recipe @@ -154,7 +154,9 @@ class NatGeo(BasicNewsRecipe): if photoart := soup.find(attrs={'class':lambda x: x and 'BgImagePromo__Container__Text__Link' in x.split()}): ans2 = [] title = self.tag_to_string(photoart) - url = 'https://www.nationalgeographic.com' + photoart['href'] + url = photoart['href'] + if url.startswith('/'): + url = 'https://www.nationalgeographic.com' + photoart['href'] ans2.append(('Photo Essay', [{'title': title, 'url': url}])) for gird in soup.findAll(attrs={'class':'GridPromoTile'}): for article in soup.findAll('article'):