Fix #3987 (Barrons.com is not properly parsed anymore)

2025-08-11 09:13:57 -04:00 · 2009-11-13 09:43:56 -07:00 · 2009-11-13 09:43:56 -07:00 · fb0062dd69
commit fb0062dd69
parent 6e47d77a2a
1 changed files with 38 additions and 3 deletions
--- a/resources/recipes/barrons.recipe
+++ b/resources/recipes/barrons.recipe
@@ -21,7 +21,7 @@ class Barrons(BasicNewsRecipe):
        description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
        timefmt  = ' [%a, %b %d, %Y]'
        use_embedded_content   = False
-        no_stylesheets = False
+        no_stylesheets = True
        match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
        conversion_options = {'linearize_tables': True}
        ##delay = 1
@@ -29,6 +29,20 @@ class Barrons(BasicNewsRecipe):
        ## Don't grab articles more than 7 days old
        oldest_article = 7

+        extra_css = '''
+                    .datestamp{color:#666666; font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
+                    h3{color:#FF0000; font-family:Georgia,"Times New Roman",Times,serif; }
+                    h2{font-family:Georgia,"Times New Roman",Times,serif; }
+                    h1{ font-family:Georgia,"Times New Roman",Times,serif; }
+                    .byline{color:#AAAAAA; font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
+                    .subhead{color:#666666; font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
+                    .articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#333333;}
+                    .insettipUnit{font-size: x-small;}
+                    '''
+        remove_tags = [
+                           dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
+                           dict(name = 'a', attrs ={'class':'insetClose'})
+                        ]

        preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                [
@@ -56,10 +70,20 @@ class Barrons(BasicNewsRecipe):
                br.submit()
            return br

-## Use the print version of a page when available.
+        ## Use the print version of a page: drop the query string, then append '#printmode'.

        def print_version(self, url):
-                return url.replace('/article/', '/article_print/')
+               main = url.partition('?')[0]  ## partition() keeps a '?'-less URL whole; rpartition() gave ''
+               return main + '#printmode'
+            
+        def postprocess_html(self, soup, first):
+            '''Rename every <ul>/<li> to <div>, remove div#articleThumbnail_1, return soup.'''
+            for list_tag in soup.findAll(name=['ul', 'li']):
+                list_tag.name = 'div'
+            thumbnails = soup.findAll(name='div', attrs={'id': "articleThumbnail_1"})
+            for thumbnail in thumbnails:
+                thumbnail.extract()
+            return soup

 ## Comment out the feeds you don't want retrieved.
 ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
@@ -74,6 +98,17 @@ class Barrons(BasicNewsRecipe):
                ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
                ]

+            
+        def get_cover_url(self):
+            '''Return the src of the first <img> in the home page's magazine <ul>, or None.'''
+            soup = self.index_to_soup('http://online.barrons.com/home-page')
+            link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
+            ## A matching <ul> with no <img> used to raise TypeError on img['src'].
+            cover_img = link_item.img if link_item is not None else None
+            ## .get() also tolerates an <img> that carries no src attribute.
+            return cover_img.get('src') if cover_img is not None else None
+
+        
        ## Logout of website
        ## NOT CURRENTLY WORKING
        # def cleanup(self):