import re
from collections import defaultdict
from datetime import date

from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes


class barrons(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the latest Barron's issue.

    ``parse_index`` reads the current year's magazine archive page to find
    the issue date, cover and article list; each article is then fetched
    through its AMP URL (see ``print_version``) and cleaned up by
    ``preprocess_html``.
    """

    title = 'Barron\'s Magazine'
    __author__ = 'unkn0wn'
    description = (
        'Barron\'s is an American weekly magazine/newspaper published by Dow Jones & Company. Founded in 1921 as a sister '
        'publication to The Wall Street Journal, Barron\'s covers U.S. financial information, market developments, and '
        'relevant statistics.'
    )
    language = 'en_US'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['height', 'width', 'style']
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://www.barrons.com/asset/barrons/images/barrons-logo.png'
    delay = 1

    # Adjacent literals are concatenated into a single stylesheet string;
    # the leading/trailing spaces keep the value identical to the original.
    extra_css = (
        ' img {display:block; margin:0 auto;}'
        ' .figc { font-size:small; text-align:center; }'
        ' .imageCredit { color:#404040; font-size:x-small; }'
        ' .headline__category, .article-prebody { font-size:small; color:#404040; }'
        ' .sub-head { color:#202020; }'
        ' '
    )

    # Only the headline/lead blocks and the main content section are kept.
    keep_only_tags = [
        classes('headline articleLead article-prebody'),
        dict(name='section', attrs={'subscriptions-section': 'content'}),
    ]

    # Non-content elements, ads and newsletter insets are dropped.
    remove_tags = [
        dict(name=['meta', 'link', 'svg', 'button', 'i-amphtml-sizer']),
        classes('wsj-ad dynamic-inset-overflow newsletter-inset'),
    ]

    def preprocess_html(self, soup):
        """Rewrite AMP-specific markup in *soup* into plain HTML.

        - ``<figcaption>`` gets the ``figc`` class and its ``<p>`` children
          become ``<div>``.
        - ``<p>`` inside byline elements become ``<span>``.
        - ``<h2>`` is demoted to ``<h4>``.
        - ``<amp-iframe>`` elements whose ``src`` names a Datawrapper chart
          are turned into ``<img>`` tags pointing at the chart's PNG.
        - ``<amp-img>`` sources get ``&pixel_ratio=1.5`` appended; an
          ``<amp-img>`` without a usable nested ``<img>`` is itself renamed
          to ``<img>``.

        Returns the modified *soup*.
        """
        for figc in soup.findAll('figcaption'):
            figc['class'] = 'figc'
            for p in figc.findAll('p'):
                p.name = 'div'

        for by in soup.findAll(**classes('byline')):
            for p in by.findAll('p'):
                p.name = 'span'

        for h2 in soup.findAll('h2'):
            h2.name = 'h4'

        for iframe in soup.findAll('amp-iframe'):
            placeholder = iframe.find('amp-img')
            if placeholder:
                placeholder.decompose()
            # An <amp-iframe> is not guaranteed to carry ``src`` (it may use
            # ``srcdoc``); indexing would raise KeyError and abort the article.
            src = iframe.get('src') or ''
            chart = re.search(r'datawrapper-chart-(.{5})', src)
            if chart:
                iframe.name = 'img'
                iframe['src'] = 'https://datawrapper.dwcdn.net/' + chart.group(1) + '/full.png'

        for amp in soup.findAll('amp-img'):
            inner = amp.find('img', attrs={'src': True})
            if inner is None:
                # No nested <img> with a source: promote the <amp-img> itself.
                if amp.has_attr('src'):
                    amp['src'] = amp['src'] + '&pixel_ratio=1.5'
                    amp.name = 'img'
            else:
                # Update the tag that was actually found to have ``src``,
                # rather than whatever <img> happens to come first.
                inner['src'] = inner['src'] + '&pixel_ratio=1.5'
        return soup

    def get_browser(self, *args, **kwargs):
        """Return the recipe's browser with crawler-like request headers.

        The user agent is set to Googlebot's and ``Referer`` /
        ``X-Forwarded-For`` headers are added.
        NOTE(review): presumably this makes the site serve full article
        text as it would to a search crawler -- confirm before changing.
        """
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [
            ('Referer', 'https://www.google.com/'),
            ('X-Forwarded-For', '66.249.66.1'),
        ]
        return br

    def parse_index(self):
        """Build the section -> articles index from the magazine archive page.

        Side effects: sets ``self.timefmt`` (issue date) and
        ``self.cover_url`` from the first archive box on the page.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples,
        where each article dict has ``title``, ``url`` and ``description``.
        Stories without a headline link are logged and skipped.
        """
        archive = self.index_to_soup(
            'https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y')
        )
        issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--'))
        self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']'
        self.log(self.timefmt)
        # Drop the query string from the cover image URL.
        self.cover_url = issue.img['src'].split('?')[0]

        ans = defaultdict(list)

        for story in archive.findAll(**prefixed_classes('BarronsTheme--story--')):
            # Section comes from the nearest preceding "strap" sibling, if any.
            section = 'Magazine'
            strap = story.find_previous_sibling(**prefixed_classes('BarronsTheme--strap--'))
            if strap:
                label = strap.find(**prefixed_classes('BarronsTheme--label--'))
                if label:
                    section = self.tag_to_string(label).strip()

            heading = story.find(**prefixed_classes('BarronsTheme--heading'))
            link = heading.a if heading is not None else None
            url = link.get('href') if link is not None else None
            if not url:
                # One malformed card should not abort the whole issue.
                self.log('\tSkipping story without a headline link')
                continue
            title = self.tag_to_string(heading).strip()

            desc = ''
            byl = story.find(**prefixed_classes('BarronsTheme--byline--'))
            if byl:
                desc += self.tag_to_string(byl)
            ttr = story.find(**prefixed_classes('BarronsTheme--time-to-read--'))
            if ttr:
                desc += self.tag_to_string(ttr)
            summ = story.find(**prefixed_classes('BarronsTheme--summary--'))
            if summ:
                desc += ' | ' + self.tag_to_string(summ)

            self.log('\t', title, ' ', url, '\n\t', desc)
            ans[section].append({'title': title, 'url': url, 'description': desc})

        return list(ans.items())

    def print_version(self, url):
        """Return the AMP URL for *url*.

        The query string is stripped and ``/articles/`` is rewritten to
        ``/amp/articles/``.
        """
        return url.split('?')[0].replace('/articles/', '/amp/articles/')