Update barrons.recipe

unkn0w7n 2024-01-21 12:26:02 +05:30
parent dff94f6f62
commit fae136f103


@@ -1,122 +1,97 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
-# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import json
-from mechanize import Request
-
-from calibre import random_user_agent
-from calibre.web.feeds.news import BasicNewsRecipe
-from base64 import standard_b64encode
-try:
-    import urllib.parse as urlparse
-except ImportError:
-    import urlparse
-try:
-    from urllib.parse import quote
-except ImportError:
-    from urllib import quote
-
-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
-
-
-MAGAZINE_INDEX = 'https://www.barrons.com/magazine'
-
-
-class BarronsMagazine(BasicNewsRecipe):
-    title = 'Barron\'s Magazine'
-    __author__ = 'Kovid Goyal'
-    description = 'Financial news from the publisher of the WSJ'
-    language = 'en'
-    needs_subscription = True
-    no_stylesheets = True
-
-    keep_only_tags = [
-        classes('article__headline article__body'),
-    ]
-
-    def get_browser(self, *a, **kw):
-        # To understand the login logic read app-min.js from
-        # https://sso.accounts.dowjones.com/login
-        kw['user_agent'] = random_user_agent(allow_ie=False)
-        br = super().get_browser(*a, **kw)
-        if not self.username or not self.password:
-            self.barrons_itp_page = br.open(MAGAZINE_INDEX).read()
-            return br
-        itp = quote(MAGAZINE_INDEX, safe='')
-        start_url = 'https://accounts.barrons.com/login?target=' + itp
-        self.log('Starting login process...')
-        res = br.open(start_url)
-        sso_url = res.geturl()
-        query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
-        query = {k:v[0] for k, v in query.items()}
-        request_query = {
-            'username': self.username,
-            'password': self.password,
-            'client_id': query['client'],
-            'sso': 'true',
-            'tenant': 'sso',
-            '_intstate': 'deprecated',
-            'connection': 'DJldap',
-        }
-        for cookie in br.cookiejar:
-            if cookie.name in ('_csrf', 'csrf'):
-                request_query['_csrf'] = cookie.value
-        for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
-            if k in query:
-                request_query[k] = query[k]
-        login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
-        # you can get the version below from lib-min.js
-        # search for: str: "x.x.x"
-        # This might need to be updated in the future
-        auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
-        if not isinstance(auth0_client, bytes):
-            auth0_client = auth0_client.encode('utf-8')
-        auth0_client = standard_b64encode(auth0_client)
-        if isinstance(auth0_client, bytes):
-            auth0_client = auth0_client.decode('ascii')
-        rq = Request(login_url, headers={
-            'Accept': 'text/html',
-            'Accept-Language': 'en-US,en;q=0.8',
-            'Auth0-Client': auth0_client.rstrip('='),
-            'X-HTTP-Method-Override': 'POST',
-            'X-Requested-With': 'XMLHttpRequest',
-            'X-Remote-User': self.username
-        }, data=request_query)
-        self.log('Sending login request...')
-        try:
-            res = br.open(rq)
-        except Exception as err:
-            if hasattr(err, 'read'):
-                raise Exception('Login request failed with error: {} and body: {}'.format(err, err.read().decode('utf-8', 'replace')))
-            raise
-        if res.code != 200:
-            raise ValueError('Failed to login, check your username and password')
-        br.select_form(nr=0)
-        self.log('Performing login callback...')
-        res = br.submit()
-        self.barrons_itp_page = raw = res.read()
-        if b'/logout' not in raw:
-            raise ValueError(
-                'Failed to login (callback URL failed), check username and password')
-        return br
-
-    def parse_index(self):
-        soup = self.index_to_soup(self.barrons_itp_page)
-        articles = []
-        for art in soup.findAll('article'):
-            h = art.find(['h2', 'h3'])
-            a = h.find('a')
-            title = self.tag_to_string(a)
-            url = a['href']
-            desc = ''
-            p = art.find('p', attrs={'class': lambda x: x and ('_summary_' in x or '_byline_' in x)})
-            if p:
-                desc += self.tag_to_string(p)
-            articles.append({'title': title, 'url': url, 'description': desc})
-        return [('Articles', articles)]
+from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
+from datetime import date
+import re
+
+
+class barrons(BasicNewsRecipe):
+    title = 'Barron\'s Magazine'
+    __author__ = 'unkn0wn'
+    description = (
+        'Barron\'s is an American weekly magazine/newspaper published by Dow Jones & Company. Founded in 1921 as a sister '
+        'publication to The Wall Street Journal, Barron\'s covers U.S. financial information, market developments, and '
+        'relevant statistics.'
+    )
+    language = 'en_US'
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['height', 'width', 'style']
+    encoding = 'utf-8'
+    ignore_duplicate_articles = {'url'}
+    masthead_url = 'https://www.barrons.com/asset/barrons/images/barrons-logo.png'
+    delay = 1
+
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        .figc { font-size:small; text-align:center; }
+        .imageCredit { color:#404040; font-size:x-small; }
+        .headline__category { font-size:small; color:#404040; }
+        .sub-head { color:#202020; }
+    '''
+
+    keep_only_tags = [
+        classes('headline articleLead'),
+        dict(name='section', attrs={'subscriptions-section':'content'})
+    ]
+    remove_tags = [
+        dict(name=['meta', 'link', 'svg', 'button', 'i-amphtml-sizer']),
+        classes('wsj-ad dynamic-inset-overflow')
+    ]
+
+    def preprocess_html(self, soup):
+        for figc in soup.findAll('figcaption'):
+            figc['class'] = 'figc'
+            for p in figc.findAll('p'):
+                p.name = 'div'
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        for iframe in soup.findAll('amp-iframe'):
+            wsj = iframe.find('amp-img')
+            if wsj:
+                wsj.decompose()
+            data = re.search(r'datawrapper-chart-(.{5})', iframe['src'])
+            if data:
+                iframe.name = 'img'
+                iframe['src'] = 'https://datawrapper.dwcdn.net/' + data.group(1) + '/full.png'
+        for amp in soup.findAll('amp-img'):
+            if not amp.find('img', attrs={'src':True}):
+                amp.name = 'img'
+        return soup
+
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [
+            ('Referer', 'https://www.google.com/'),
+            ('X-Forwarded-For', '66.249.66.1')
+        ]
+        return br
+
+    def parse_index(self):
+        archive = self.index_to_soup('https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y'))
+        issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--'))
+        self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']'
+        self.log(self.timefmt)
+        self.cover_url = issue.img['src'].split('?')[0]
+
+        ans = []
+        for articles in archive.findAll(**prefixed_classes('BarronsTheme--story--')):
+            a = articles.find(**prefixed_classes('BarronsTheme--heading'))
+            title = self.tag_to_string(a).strip()
+            url = a.a['href']
+            desc = ''
+            byl = articles.find(**prefixed_classes('BarronsTheme--byline--'))
+            if byl:
+                desc += self.tag_to_string(byl)
+            ttr = articles.find(**prefixed_classes('BarronsTheme--time-to-read--'))
+            if ttr:
+                desc += self.tag_to_string(ttr)
+            summ = articles.find(**prefixed_classes('BarronsTheme--summary--'))
+            if summ:
+                desc += ' | ' + self.tag_to_string(summ)
+            self.log('\t', title, ' ', url, '\n\t', desc)
+            ans.append({'title': title, 'url': url, 'description': desc})
+        return [('Articles', ans)]
+
+    def print_version(self, url):
+        return url.split('?')[0].replace('/articles/', '/amp/articles/')
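
The rewritten recipe drops the old Dow Jones SSO login entirely; it fetches pages with a Googlebot user agent and relies on two simple URL rewrites: print_version() switches an article to its AMP endpoint, and preprocess_html() replaces Datawrapper chart amp-iframes with static PNG renders. The snippet below is a minimal, standalone sketch of just those two rewrites for anyone who wants to try them outside calibre; the helper names (amp_url, datawrapper_png) and the sample URLs are illustrative and not part of the recipe.

import re


def amp_url(url):
    # Mirrors print_version() above: drop the query string and switch the
    # article path to the AMP endpoint.
    return url.split('?')[0].replace('/articles/', '/amp/articles/')


def datawrapper_png(iframe_src):
    # Mirrors the amp-iframe handling in preprocess_html() above: extract the
    # five-character Datawrapper chart id and point at its static PNG render.
    m = re.search(r'datawrapper-chart-(.{5})', iframe_src)
    if not m:
        return None
    return 'https://datawrapper.dwcdn.net/' + m.group(1) + '/full.png'


if __name__ == '__main__':
    # Hypothetical example inputs, for illustration only.
    print(amp_url('https://www.barrons.com/articles/example-story-51700000000?mod=hp_LEAD_1'))
    print(datawrapper_png('https://www.barrons.com/embed#datawrapper-chart-Ab1C2'))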