From ad85cc973437410daa1086ee43f46f727168d6c7 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 1 Feb 2024 18:04:02 +0530 Subject: [PATCH] Update barrons.recipe --- recipes/barrons.recipe | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 522fae16b1..923e66d6e5 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes +from collections import defaultdict from datetime import date import re @@ -24,17 +25,17 @@ class barrons(BasicNewsRecipe): img {display:block; margin:0 auto;} .figc { font-size:small; text-align:center; } .imageCredit { color:#404040; font-size:x-small; } - .headline__category { font-size:small; color:#404040; } + .headline__category, .article-prebody { font-size:small; color:#404040; } .sub-head { color:#202020; } ''' keep_only_tags = [ - classes('headline articleLead'), + classes('headline articleLead article-prebody'), dict(name='section', attrs={'subscriptions-section':'content'}) ] remove_tags = [ dict(name=['meta', 'link', 'svg', 'button', 'i-amphtml-sizer']), - classes('wsj-ad dynamic-inset-overflow') + classes('wsj-ad dynamic-inset-overflow newsletter-inset') ] def preprocess_html(self, soup): @@ -42,6 +43,9 @@ class barrons(BasicNewsRecipe): figc['class'] = 'figc' for p in figc.findAll('p'): p.name = 'div' + for by in soup.findAll(**classes('byline')): + for p in by.findAll('p'): + p.name = 'span' for h2 in soup.findAll('h2'): h2.name = 'h4' for iframe in soup.findAll('amp-iframe'): @@ -54,7 +58,11 @@ class barrons(BasicNewsRecipe): iframe['src'] = 'https://datawrapper.dwcdn.net/' + data.group(1) + '/full.png' for amp in soup.findAll('amp-img'): if not amp.find('img', attrs={'src':True}): - amp.name = 'img' + if amp.has_attr('src'): + amp['src'] = amp['src'] + '&pixel_ratio=1.5' + amp.name = 'img' + else: + amp.img['src'] = amp.img['src'] + '&pixel_ratio=1.5' return soup def get_browser(self, *args, **kwargs): @@ -73,9 +81,15 @@ class barrons(BasicNewsRecipe): self.log(self.timefmt) self.cover_url = issue.img['src'].split('?')[0] - ans = [] + ans = defaultdict(list) for articles in archive.findAll(**prefixed_classes('BarronsTheme--story--')): + section = 'Magazine' + strap = articles.find_previous_sibling(**prefixed_classes('BarronsTheme--strap--')) + if strap: + label = strap.find(**prefixed_classes('BarronsTheme--label--')) + if label: + section = self.tag_to_string(label).strip() a = articles.find(**prefixed_classes('BarronsTheme--heading')) title = self.tag_to_string(a).strip() url = a.a['href'] @@ -90,8 +104,8 @@ class barrons(BasicNewsRecipe): if summ: desc += ' | ' + self.tag_to_string(summ) self.log('\t', title, ' ', url, '\n\t', desc) - ans.append({'title': title, 'url': url, 'description': desc}) - return [('Articles', ans)] + ans[section].append({'title': title, 'url': url, 'description': desc}) + return [(section, articles) for section, articles in ans.items()] def print_version(self, url): return url.split('?')[0].replace('/articles/', '/amp/articles/')