Update Barrons

Fixes #1308140 [Private bug](https://bugs.launchpad.net/calibre/+bug/1308140)
2025-08-30 23:00:21 -04:00 · 2014-04-16 09:06:54 +05:30 · 2014-04-16 09:06:54 +05:30 · 112e38cb54
commit 112e38cb54
parent ac6af50422
1 changed files with 62 additions and 113 deletions
--- a/recipes/barrons.recipe
+++ b/recipes/barrons.recipe
@@ -1,129 +1,78 @@
 ##
 ##    web2lrf profile to download articles from Barrons.com
 ##    can download subscriber-only content if username and
 ##    password are supplied.
 ##
 '''
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class Barrons(BasicNewsRecipe):
-        title = 'Barron\'s'
+    title = 'Barron\'s'
-        max_articles_per_feed = 50
+    max_articles_per_feed = 50
-        needs_subscription    = True
+    needs_subscription    = True
-        language = 'en'
+    language = 'en'
-        __author__ = 'Kovid Goyal and Sujata Raman'
+    __author__ = 'Kovid Goyal'
-        description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
+    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
-        timefmt  = ' [%a, %b %d, %Y]'
+    timefmt  = ' [%a, %b %d, %Y]'
-        use_embedded_content   = False
+    use_embedded_content   = False
-        no_stylesheets = True
+    no_stylesheets = True
-        match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
+    match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
-        conversion_options = {'linearize_tables': True}
+    conversion_options = {'linearize_tables': True}
-        ##delay = 1
+    ##delay = 1
-        ## Don't grab articles more than 7 days old
+    # Don't grab articles more than 7 days old
-        oldest_article = 7
+    oldest_article = 7
-        use_javascript_to_login = True
+    use_javascript_to_login = True
-        requires_version = (0, 9, 16)
+    requires_version = (0, 9, 16)
-        extra_css = '''
+    keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})]
-                    .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
+    remove_tags = [
-                    h3{font-family:Georgia,"Times New Roman",Times,serif; }
+        dict(name='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
-                    h2{font-family:Georgia,"Times New Roman",Times,serif; }
+        dict(attrs={'class':['insetButton', 'insettipBox', 'insetClose']}),
-                    h1{ font-family:Georgia,"Times New Roman",Times,serif; }
+        dict(attrs={'data-module-name':['resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}),
-                    .byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
+        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
-                    .subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
+    ]
                    .articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
                    .insettipUnit{font-size: x-small;}
                    '''
        remove_tags = [
                           dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
                           dict(name = 'a', attrs ={'class':'insetClose'})
                        ]
-        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+    def javascript_login(self, br, username, password):
-                [
+        br.visit('http://commerce.barrons.com/auth/login')
-                ## Remove anything before the body of the article.
+        f = br.select_form(nr=0)
-                (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
+        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)
-                ## Remove any insets from the body of the article.
+    # Use the print version of a page when available.
-                (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
+    def print_version(self, url):
        main, sep, rest = url.rpartition('?')
        return main + '#text.print'
-                ## Remove any reprint info from the body of the article.
+    def preprocess_html(self, soup):
-                (r'<hr size.*?<p', lambda match : '<p'),
+        # Remove thumbnail for zoomable images
        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()
-                ## Remove anything after the end of the article.
+        return soup
-                (r'<!-- article end.*?</body>', lambda match : '</body>'),
+
-                ]
+# Comment out the feeds you don't want retrieved.
 # Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
    def get_feeds(self):
        """Return the list of (title, feed URL) pairs for this recipe.

        Delete or comment out an entry to stop that feed being retrieved.
        """
        feeds = [
            ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]
        return feeds
-        def javascript_login(self, br, username, password):
+    def get_article_url(self, article):
-            br.visit('http://commerce.barrons.com/auth/login')
+        return article.get('link', None)
            f = br.select_form(nr=0)
            f['username'] = username
            f['password'] = password
            br.submit(timeout=120)
        ## Use the print version of a page when available.
        def print_version(self, url):
            """Return the print-view address for an article ``url``.

            Everything from the last ``?`` onwards is dropped and the
            ``#text.print`` fragment is appended. A URL that has no query
            string is kept whole, so the result is never a bare fragment.
            """
            main, sep, _rest = url.rpartition('?')
            if not sep:
                # rpartition() puts the whole string in the last slot when the
                # separator is missing, leaving ``main`` empty.
                main = url
            return main + '#text.print'
        def postprocess_html(self, soup, first):
            """Flatten list markup and remove the article thumbnail.

            Every ``ul``/``li`` tag is renamed to ``div``; afterwards each
            ``div`` whose id is ``articleThumbnail_1`` is extracted from the
            tree. ``first`` is not used. The same ``soup`` object is returned
            after being modified in place.
            """
            # Rename first: the thumbnail search below runs on the already
            # flattened tree.
            list_tags = soup.findAll(name=['ul', 'li'])
            for node in list_tags:
                node.name = 'div'
            thumbnails = soup.findAll(name='div', attrs={'id': "articleThumbnail_1"})
            for node in thumbnails:
                node.extract()
            return soup
 ## Comment out the feeds you don't want retrieved.
 ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
        def get_feeds(self):
            """Return the list of (title, feed URL) pairs for this recipe."""
            feeds = [
                ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
                ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
                ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
                ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
                ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
            ]
            return feeds
        def get_article_url(self, article):
            """Return the ``link`` entry of ``article``, or None when absent."""
            link = article.get('link')
            return link
        def get_cover_url(self):
            """Return the cover image URL from the home page, or None.

            The home page is loaded through ``self.index_to_soup`` and the
            ``src`` of the image inside the ``ul`` with class
            ``newsItem barronsMag`` is returned. None is returned when the
            list, its image, or the image's ``src`` attribute is missing.
            """
            index = 'http://online.barrons.com/home-page'
            soup = self.index_to_soup(index)
            link_item = soup.find('ul', attrs={'class': 'newsItem barronsMag'})
            if not link_item:
                return None
            # The list can be present without an <img> child; indexing None
            # would raise, so report "no cover" instead.
            img = link_item.img
            if img is None:
                return None
            return img.get('src')
        ## Logout of website
        ## NOT CURRENTLY WORKING
        # def cleanup(self):
            # try:
                # self.browser.set_debug_responses(True)
                # import sys, logging
                # logger = logging.getLogger("mechanize")
                # logger.addHandler(logging.StreamHandler(sys.stdout))
                # logger.setLevel(logging.INFO)
                # res = self.browser.open('http://online.barrons.com/logout')
            # except:
                # import traceback
                # traceback.print_exc()
    def get_cover_url(self):
        """Return the URL of the magazine cover image, or None if not found.

        Fetches the home page via ``self.index_to_soup`` and reads the ``src``
        of the image inside the ``ul`` with class ``newsItem barronsMag``.
        A missing list, a list without an image, or an image without a
        ``src`` attribute all yield None rather than an exception.
        """
        index = 'http://online.barrons.com/home-page'
        soup = self.index_to_soup(index)
        link_item = soup.find('ul', attrs={'class': 'newsItem barronsMag'})
        if not link_item:
            return None
        # ``link_item.img`` is None when the list holds no <img>; subscripting
        # it would raise TypeError.
        img = link_item.img
        if img is None:
            return None
        return img.get('src')