Add WSJ, Barron's, and Portfolio.com profiles

This commit is contained in:
Kovid Goyal 2007-12-15 22:07:16 +00:00
parent f37d8c9dc4
commit 99ceb7a142
5 changed files with 269 additions and 3 deletions

View File

@ -32,9 +32,13 @@ from libprs500.ebooks.lrf.web.profiles.newyorkreview import NewYorkReviewOfBooks
from libprs500.ebooks.lrf.web.profiles.spiegelde import SpiegelOnline
from libprs500.ebooks.lrf.web.profiles.zeitde import ZeitNachrichten
from libprs500.ebooks.lrf.web.profiles.faznet import FazNet
from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio
builtin_profiles = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks, \
SpiegelOnline, ZeitNachrichten, FazNet]
# Registry of all built-in news-download profiles. Order here does not
# affect lookup, which goes through available_profiles below.
builtin_profiles = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks, \
SpiegelOnline, ZeitNachrichten, FazNet, WallStreetJournal, \
Barrons, Portfolio]
# Map each profile class to the last component of its module path
# (e.g. 'wsj'), used as the profile's command-line identifier.
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
def option_parser():

View File

@ -0,0 +1,89 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Barrons(DefaultProfile):
    '''
    web2lrf profile to download articles from Barrons.com. Can download
    subscriber-only content if username and password are supplied.
    '''

    title = 'Barron\'s'
    max_recursions = 3
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    ## Raw string: the original non-raw literal contained '\?', an invalid
    ## escape sequence (SyntaxWarning on modern Python). The compiled
    ## pattern is unchanged.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    ## Redundant parentheses removed -- ('x') is just the string 'x'.
    html2lrf_options = ['--ignore-tables', '--base-font-size=10']
    ##delay = 1
    ## Don't grab articles more than 7 days old
    oldest_article = 7

    ## (pattern, substitution) pairs applied to each fetched page to strip
    ## everything outside the article body before conversion.
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
            ## Remove any reprint info from the body of the article.
            (r'<hr size.*?<p', lambda match : '<p'),
            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match : '</body>'),
        ]
    ]

    def get_browser(self):
        '''
        Return a browser object, logged in to barrons.com when a username
        and password were supplied, so subscriber-only articles can be
        fetched.
        '''
        ## NOTE(review): DefaultProfile.get_browser is called without an
        ## instance argument -- presumably a classmethod/staticmethod on
        ## the base class; confirm.
        br = DefaultProfile.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://commerce.barrons.com/auth/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def print_version(self, url):
        ## Use the print version of a page when available.
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them to put them in the order you desire.
    def get_feeds(self):
        return [
            ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    ## Logout of website -- NOT CURRENTLY WORKING
    # def cleanup(self):
    #     try:
    #         self.browser.set_debug_responses(True)
    #         import sys, logging
    #         logger = logging.getLogger("mechanize")
    #         logger.addHandler(logging.StreamHandler(sys.stdout))
    #         logger.setLevel(logging.INFO)
    #         res = self.browser.open('http://online.barrons.com/logout')
    #     except:
    #         import traceback
    #         traceback.print_exc()

View File

@ -0,0 +1,42 @@
##
## web2lrf profile to download articles from Portfolio.com
##
'''
'''
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Portfolio(DefaultProfile):
    '''
    web2lrf profile that fetches articles from the Portfolio.com RSS feeds.
    No login is required.
    '''

    title = 'Portfolio'
    max_recursions = 0
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = True
    html2lrf_options = ['--ignore-tables']
    ##delay = 1
    ## Skip articles more than 30 days old.
    oldest_article = 30

    ## Comment out the feeds you don't want retrieved. Feeds are sorted
    ## alphabetically when converted to LRF, so number them or use leading
    ## spaces to control the order.
    def get_feeds(self):
        base = 'http://feeds.portfolio.com/portfolio/'
        slugs = [
            ('Business Travel', 'businesstravel'),
            ('Careers', 'careers'),
            ('Culture and Lifestyle', 'cultureandlifestyle'),
            ('Executives', 'executives'),
            ('News and Markets', 'news'),
            ('Business Spin', 'businessspin'),
            ('Capital', 'capital'),
            ('Daily Brief', 'dailybrief'),
            ('Market Movers', 'marketmovers'),
            ('Mixed Media', 'mixedmedia'),
            ('Odd Numbers', 'oddnumbers'),
            ('Playbook', 'playbook'),
            ('Tech Observer', 'thetechobserver'),
            ('World According to ...', 'theworldaccordingto'),
        ]
        return [(name, base + slug) for name, slug in slugs]

View File

@ -0,0 +1,109 @@
##
## web2lrf profile to download articles from WSJ.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class WallStreetJournal(DefaultProfile):
    '''
    web2lrf profile to download articles from WSJ.com. Subscriber-only
    content is downloaded when a username and password are supplied.
    '''

    title = 'Wall Street Journal'
    max_recursions = 2
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']
    ## Skip articles more than 7 days old.
    oldest_article = 7

    ## (pattern, substitution) pairs applied to every fetched page; they
    ## strip everything outside the article body before conversion.
    _CLEANUPS = [
        ## Drop everything before the start-of-article marker.
        (r'<body.*?<!-- article start', lambda m: '<body><!-- article start'),
        ## Drop inset boxes embedded in the article body.
        (r'<div id="inset".*?</div>.?</div>.?<p', lambda m: '<p'),
        ## Drop everything after the end-of-article marker.
        (r'<!-- article end.*?</body>', lambda m: '</body>'),
    ]
    preprocess_regexps = [(re.compile(pat, re.IGNORECASE | re.DOTALL), fn)
                          for pat, fn in _CLEANUPS]

    def get_browser(self):
        '''
        Return a browser object, logged in to wsj.com when a username and
        password were supplied, so subscriber-only articles can be fetched.
        '''
        ## NOTE(review): DefaultProfile.get_browser is called without an
        ## instance argument -- presumably a classmethod/staticmethod on
        ## the base class; confirm.
        br = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return br
        br.open('http://online.wsj.com/login')
        br.select_form(name='login_form')
        br['user'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    def print_version(self, url):
        ## Prefer the printer-friendly rendering of each article.
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved. Feeds are sorted
    ## alphabetically when converted to LRF, so leading spaces or numbers
    ## in a title control the ordering.
    def get_feeds(self):
        ## All WSJ feeds share one URL pattern; only the numeric id varies.
        rss = 'http://online.wsj.com/xml/rss/3_%s.xml'
        feeds = [
            #('Most Emailed - Day', '7030'),
            #('Most Emailed - Week', '7253'),
            #('Most Emailed - Month', '7254'),
            (' Most Viewed - Day', '7198'),
            (' Most Viewed - Week', '7251'),
            # ('Most Viewed - Month', '7252'),
            (' Today\'s Newspaper - Page One', '7205'),
            (' Today\'s Newspaper - Marketplace', '7206'),
            (' Today\'s Newspaper - Money & Investing', '7207'),
            (' Today\'s Newspaper - Personal Journal', '7208'),
            (' Today\'s Newspaper - Weekend Journal', '7209'),
            ('Opinion', '7041'),
            ('News - U.S.: What\'s News', '7011'),
            ('News - U.S. Business', '7014'),
            ('News - Europe: What\'s News', '7012'),
            ('News - Asia: What\'s News', '7013'),
            ('News - World News', '7085'),
            ('News - Economy', '7086'),
            ('News - Earnings', '7088'),
            ('News - Health', '7089'),
            ('News - Law', '7091'),
            ('News - Media & Marketing', '7020'),
            ('Technology - What\'s News', '7015'),
            ('Technology - Gadgets', '7094'),
            ('Technology - Telecommunications', '7095'),
            ('Technology - E-commerce/Media', '7096'),
            ('Technology - Asia', '7097'),
            ('Technology - Europe', '7098'),
            ('Markets - News', '7031'),
            ('Markets - Europe News', '7101'),
            ('Markets - Asia News', '7102'),
            ('Markets - Deals & Deal Makers', '7099'),
            ('Markets - Hedge Funds', '7199'),
            ('Personal Journal', '7200'),
            ('Personal Journal - Money', '7104'),
            ('Personal Journal - Health', '7089'),
            ('Personal Journal - Autos', '7092'),
            ('Personal Journal - Homes', '7105'),
            ('Personal Journal - Travel', '7106'),
            ('Personal Journal - Careers', '7107'),
            # ('Personal Journal - Gadgets', '7094'),
            ('Weekend & Leisure', '7201'),
            ('Weekend & Leisure - Weekend Journal', '7202'),
            ('Weekend & Leisure - Arts & Entertainment', '7177'),
            ('Weekend & Leisure - Books', '7203'),
            # ('Weekend & Leisure - Travel', '7106'),
            # ('Weekend & Leisure - Autos', '7092'),
            ('Weekend & Leisure - Sports', '7204'),
        ]
        return [(name, rss % fid) for name, fid in feeds]

    ## Logout of website -- NOT CURRENTLY WORKING
    # def cleanup(self):
    #     self.browser.open('http://commerce.wsj.com/auth/postlogout')

View File

@ -26,19 +26,25 @@ class NewsMenu(QMenu):
def __init__(self):
    # Build the News menu. Entries are listed alphabetically; some carry a
    # bundled icon resource, the newly added ones (Barrons, Portfolio.com,
    # WSJ) do not.
    QMenu.__init__(self)
    self.add_menu_item('Barrons', self.fetch_news_barrons)
    self.add_menu_item('BBC', self.fetch_news_bbc, ':/images/news/bbc.png')
    self.add_menu_item('Economist', self.fetch_news_economist, ':/images/news/economist.png')
    self.add_menu_item('Faz.net', self.fetch_news_faznet, ':/images/news/faznet.png')
    self.add_menu_item('Newsweek', self.fetch_news_newsweek, ':/images/news/newsweek.png')
    self.add_menu_item('New York Review of Books', self.fetch_news_nyreview, ':/images/book.svg')
    self.add_menu_item('New York Times', self.fetch_news_nytimes, ':/images/news/nytimes.png')
    self.add_menu_item('Portfolio.com', self.fetch_news_portfolio)
    self.add_menu_item('Spiegel Online', self.fetch_news_spiegelde, ':/images/news/spiegelonline.png')
    self.add_menu_item('Wall Street Journal', self.fetch_news_wsj)
    self.add_menu_item('Zeit Nachrichten', self.fetch_news_zeitde, ':/images/news/diezeit.png')
def fetch_news(self, profile, title, username=None, password=None):
    '''
    Emit the fetch_news(PyQt_PyObject) signal carrying the profile module
    name, display title and optional login credentials.
    '''
    payload = {
        'profile': profile,
        'title': title,
        'username': username,
        'password': password,
    }
    self.emit(SIGNAL('fetch_news(PyQt_PyObject)'), payload)
def fetch_news_portfolio(self, checked):
    # Menu handler: Portfolio.com needs no login, so fetch directly.
    self.fetch_news('portfolio', 'Portfolio.com')
def fetch_news_spiegelde(self, checked):
    # Menu handler: Spiegel Online needs no login, so fetch directly.
    self.fetch_news('spiegelde', 'Spiegel Online')
@ -66,4 +72,20 @@ class NewsMenu(QMenu):
d.exec_()
if d.result() == QDialog.Accepted:
un, pw = d.username(), d.password()
self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
def fetch_news_wsj(self, checked):
    '''
    Menu handler: prompt for wsj.com credentials, then fetch the Wall
    Street Journal profile. Does nothing if the dialog is cancelled.
    '''
    d = PasswordDialog(self, 'wsj info dialog',
                       '<p>Please enter your username and password for wsj.com<br>Click OK to proceed.')
    d.exec_()
    if d.result() != QDialog.Accepted:
        return
    self.fetch_news('wsj', 'Wall Street Journal',
                    username=d.username(), password=d.password())
def fetch_news_barrons(self, checked):
    '''
    Menu handler: prompt for barrons.com credentials, then fetch the
    Barrons profile. Does nothing if the dialog is cancelled.
    '''
    d = PasswordDialog(self, 'barrons info dialog',
                       '<p>Please enter your username and password for barrons.com<br>Click OK to proceed.')
    d.exec_()
    if d.result() != QDialog.Accepted:
        return
    self.fetch_news('barrons', 'Barrons',
                    username=d.username(), password=d.password())