Add WSJ, Barrons and Portfolio.com profiles

2025-07-09 03:04:10 -04:00 · 2007-12-15 22:07:16 +00:00 · 2007-12-15 22:07:16 +00:00 · 99ceb7a142
commit 99ceb7a142
parent f37d8c9dc4
5 changed files with 269 additions and 3 deletions
--- a/src/libprs500/ebooks/lrf/web/convert_from.py
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@ -32,9 +32,13 @@ from libprs500.ebooks.lrf.web.profiles.newyorkreview import NewYorkReviewOfBooks
 from libprs500.ebooks.lrf.web.profiles.spiegelde import SpiegelOnline
 from libprs500.ebooks.lrf.web.profiles.zeitde import ZeitNachrichten
 from libprs500.ebooks.lrf.web.profiles.faznet import FazNet
 from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
 from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
 from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio  
 builtin_profiles   = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks,   \
-                      SpiegelOnline, ZeitNachrichten, FazNet]
+                      SpiegelOnline, ZeitNachrichten, FazNet, WallStreetJournal, \
                      Barrons, Portfolio]
 available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] 
 def option_parser():
--- a/src/libprs500/ebooks/lrf/web/profiles/barrons.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/barrons.py
@ -0,0 +1,89 @@
 ##
 ##    web2lrf profile to download articles from Barrons.com.
 ##    It can download subscriber-only content if a username and
 ##    password are supplied.
 ##
 ''' 
 ''' 
 import re 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile  
 class Barrons(DefaultProfile):
     '''
     web2lrf profile for Barron's.
     Overrides the profile settings below, logs in through the Barron's
     login form when both a username and a password are set, rewrites
     article URLs to their print version and lists the RSS feeds to fetch.
     '''
     title = 'Barron\'s'
     max_recursions = 3
     max_articles_per_feed = 50
     timefmt = ' [%a, %b %d, %Y]'
     html_description = True
     no_stylesheets = False
     ## Raw string: \? is a regex escape, not a string escape. The pattern
     ## text itself is unchanged.
     match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
     ## Plain strings; the parentheses previously wrapped around each option
     ## did not create tuples.
     html2lrf_options = ['--ignore-tables', '--base-font-size=10']
     ##delay = 1
     ## Don't grab articles more than 7 days old
     oldest_article = 7
     ## Each entry is (compiled regex, replacement callable). All patterns
     ## are compiled case-insensitively and with '.' matching newlines.
     preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
             [
             ## Remove anything before the body of the article.
             (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
             ## Remove any insets from the body of the article.
             (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
             ## Remove any reprint info from the body of the article.
             (r'<hr size.*?<p', lambda match : '<p'),
             ## Remove anything after the end of the article.
             (r'<!-- article end.*?</body>', lambda match : '</body>'),
             ]
     ]
     def get_browser(self):
         '''
         Return the browser obtained from DefaultProfile.get_browser().
         If both self.username and self.password are set, the Barron's
         login page is opened and its login form submitted first.
         '''
         ## NOTE(review): called without self; assumes
         ## DefaultProfile.get_browser is a class/static method -- confirm.
         br = DefaultProfile.get_browser()
         if self.username is not None and self.password is not None:
             br.open('http://commerce.barrons.com/auth/login')
             br.select_form(name='login_form')
             br['user'] = self.username
             br['password'] = self.password
             br.submit()
         return br
     ## Use the print version of a page when available.
     def print_version(self, url):
         '''
         Return url with '/article/' replaced by '/article_print/'.
         A url that does not contain '/article/' is returned unchanged.
         '''
         return url.replace('/article/', '/article_print/')
     ## Comment out the feeds you don't want retrieved.
     ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
     def get_feeds(self):
         '''
         Return the feeds to fetch as a list of (title, RSS URL) tuples.
         '''
         return [
             ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
             ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
             ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
             ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
             ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
             ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
             ]
     ## Logout of website
     ## NOT CURRENTLY WORKING
     # def cleanup(self):
         # try:
             # self.browser.set_debug_responses(True)
             # import sys, logging
             # logger = logging.getLogger("mechanize")
             # logger.addHandler(logging.StreamHandler(sys.stdout))
             # logger.setLevel(logging.INFO)
             # res = self.browser.open('http://online.barrons.com/logout')
         # except:
             # import traceback
             # traceback.print_exc()
--- a/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
@ -0,0 +1,42 @@
 ##
 ##    web2lrf profile to download articles from Portfolio.com 
 ##
 ''' 
 ''' 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile  
 class Portfolio(DefaultProfile):
     '''
     web2lrf profile for Portfolio.com.
     Only overrides profile settings and supplies the list of RSS feeds.
     '''
     title = 'Portfolio'
     max_recursions = 0
     max_articles_per_feed = 50
     timefmt = ' [%a, %b %d, %Y]'
     html_description = True
     no_stylesheets = True
     html2lrf_options = ['--ignore-tables']
     ##delay = 1
     ## Don't grab articles more than 30 days old
     oldest_article = 30
     ## Comment out the feeds you don't want retrieved.
     ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire
     def get_feeds(self):
         '''
         Return the feeds to fetch as a list of (title, RSS URL) tuples.
         '''
         feeds = [
             ('Business Travel', 'http://feeds.portfolio.com/portfolio/businesstravel'),
             ('Careers', 'http://feeds.portfolio.com/portfolio/careers'),
             ('Culture and Lifestyle', 'http://feeds.portfolio.com/portfolio/cultureandlifestyle'),
             ('Executives', 'http://feeds.portfolio.com/portfolio/executives'),
             ('News and Markets', 'http://feeds.portfolio.com/portfolio/news'),
             ('Business Spin', 'http://feeds.portfolio.com/portfolio/businessspin'),
             ('Capital', 'http://feeds.portfolio.com/portfolio/capital'),
             ('Daily Brief', 'http://feeds.portfolio.com/portfolio/dailybrief'),
             ('Market Movers', 'http://feeds.portfolio.com/portfolio/marketmovers'),
             ('Mixed Media', 'http://feeds.portfolio.com/portfolio/mixedmedia'),
             ('Odd Numbers', 'http://feeds.portfolio.com/portfolio/oddnumbers'),
             ('Playbook', 'http://feeds.portfolio.com/portfolio/playbook'),
             ('Tech Observer', 'http://feeds.portfolio.com/portfolio/thetechobserver'),
             ('World According to ...', 'http://feeds.portfolio.com/portfolio/theworldaccordingto'),
         ]
         return feeds
--- a/src/libprs500/ebooks/lrf/web/profiles/wsj.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/wsj.py
@ -0,0 +1,109 @@
 ##
 ##    web2lrf profile to download articles from WSJ.com.
 ##    It can download subscriber-only content if a username and
 ##    password are supplied.
 ##
 ''' 
 ''' 
 import re 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile  
 class WallStreetJournal(DefaultProfile):
     '''
     web2lrf profile for the Wall Street Journal.
     Overrides the profile settings below, logs in through the wsj.com
     login form when both a username and a password are set, rewrites
     article URLs to their print version and lists the RSS feeds to fetch.
     '''
     title = 'Wall Street Journal'
     max_recursions = 2
     max_articles_per_feed = 50
     timefmt = ' [%a, %b %d, %Y]'
     html_description = True
     no_stylesheets = False
     ## Plain string; the parentheses previously wrapped around the option
     ## did not create a tuple.
     html2lrf_options = ['--ignore-tables']
     ## Don't grab articles more than 7 days old
     oldest_article = 7
     ## Each entry is (compiled regex, replacement callable). All patterns
     ## are compiled case-insensitively and with '.' matching newlines.
     preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
             [
             ## Remove anything before the body of the article.
             (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
             ## Remove any insets from the body of the article.
             (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
             ## Remove anything after the end of the article.
             (r'<!-- article end.*?</body>', lambda match : '</body>'),
             ]
     ]
     def get_browser(self):
         '''
         Return the browser obtained from DefaultProfile.get_browser().
         If both self.username and self.password are set, the wsj.com
         login page is opened and its login form submitted first.
         '''
         ## NOTE(review): called without self; assumes
         ## DefaultProfile.get_browser is a class/static method -- confirm.
         br = DefaultProfile.get_browser()
         if self.username is not None and self.password is not None:
             br.open('http://online.wsj.com/login')
             br.select_form(name='login_form')
             br['user'] = self.username
             br['password'] = self.password
             br.submit()
         return br
     ## Use the print version of a page when available.
     def print_version(self, url):
         '''
         Return url with '/article/' replaced by '/article_print/'.
         A url that does not contain '/article/' is returned unchanged.
         '''
         return url.replace('/article/', '/article_print/')
     ## Comment out the feeds you don't want retrieved.
     ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them or use spaces to put them in the order you desire
     ## A feed URL that belongs to two sections is listed once; the second
     ## listing is left commented out so the same feed is not fetched twice.
     def get_feeds(self):
         '''
         Return the feeds to fetch as a list of (title, RSS URL) tuples.
         '''
         return [
             #('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
             #('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
             #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
             (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
             (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
             # ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
             ('  Today\'s Newspaper -  Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
             ('  Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
             ('  Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
             ('  Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
             ('  Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
             ('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
             ('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
             ('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
             ('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
             ('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
             ('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
             ('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
             ('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
             ('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
             ('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
             ('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
             ('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
             ('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
             ('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
             ('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
             ('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
             ('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
             ('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
             ('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
             ('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
             ('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
             ('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
             ('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
             ('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
             ## Same URL as 'News - Health' above.
             # ('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
             ('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
             ('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
             ('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
             ('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
             # ('Personal Journal - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
             ('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
             ('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
             ('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
             ('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
             # ('Weekend & Leisure - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
             # ('Weekend & Leisure - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
             ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
             ]
     ## Logout of website
     ## NOT CURRENTLY WORKING
     # def cleanup(self):
         # self.browser.open('http://commerce.wsj.com/auth/postlogout')
--- a/src/libprs500/gui2/news.py
+++ b/src/libprs500/gui2/news.py
@ -26,19 +26,25 @@ class NewsMenu(QMenu):
    def __init__(self):
        '''Initialise the menu and add one item per news source.'''
        QMenu.__init__(self)
        # One tuple per menu item, in the order the items are added:
        # (label, handler) or (label, handler, icon resource path).
        entries = (
            ('Barrons', self.fetch_news_barrons),
            ('BBC', self.fetch_news_bbc, ':/images/news/bbc.png'),
            ('Economist', self.fetch_news_economist, ':/images/news/economist.png'),
            ('Faz.net', self.fetch_news_faznet, ':/images/news/faznet.png'),
            ('Newsweek', self.fetch_news_newsweek, ':/images/news/newsweek.png'),
            ('New York Review of Books', self.fetch_news_nyreview, ':/images/book.svg'),
            ('New York Times', self.fetch_news_nytimes, ':/images/news/nytimes.png'),
            ('Portfolio.com', self.fetch_news_portfolio),
            ('Spiegel Online', self.fetch_news_spiegelde, ':/images/news/spiegelonline.png'),
            ('Wall Street Journal', self.fetch_news_wsj),
            ('Zeit Nachrichten', self.fetch_news_zeitde, ':/images/news/diezeit.png'),
        )
        for entry in entries:
            self.add_menu_item(*entry)
    def fetch_news(self, profile, title, username=None, password=None):
        '''
        Emit the fetch_news(PyQt_PyObject) signal.

        The signal carries a dict with the keys 'profile', 'title',
        'username' and 'password', filled from the arguments of the
        same name.
        '''
        request = {
            'profile': profile,
            'title': title,
            'username': username,
            'password': password,
        }
        self.emit(SIGNAL('fetch_news(PyQt_PyObject)'), request)
    def fetch_news_portfolio(self, checked):
        '''Request the 'portfolio' profile; no credentials are passed. checked is unused.'''
        self.fetch_news(profile='portfolio', title='Portfolio.com')
    def fetch_news_spiegelde(self, checked):
        '''Request the 'spiegelde' profile; no credentials are passed. checked is unused.'''
        self.fetch_news(profile='spiegelde', title='Spiegel Online')
@ -67,3 +73,19 @@ class NewsMenu(QMenu):
        if d.result() == QDialog.Accepted:
            un, pw = d.username(), d.password()
            self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
    def fetch_news_wsj(self, checked):
        '''
        Ask for wsj.com credentials and request the 'wsj' profile.

        Shows a PasswordDialog; nothing is requested unless the dialog's
        result is QDialog.Accepted. checked is unused.
        '''
        prompt = '<p>Please enter your username and password for wsj.com<br>Click OK to proceed.'
        dialog = PasswordDialog(self, 'wsj info dialog', prompt)
        dialog.exec_()
        if dialog.result() != QDialog.Accepted:
            return
        user = dialog.username()
        secret = dialog.password()
        self.fetch_news('wsj', 'Wall Street Journal', username=user, password=secret)
    def fetch_news_barrons(self, checked):
        '''
        Ask for barrons.com credentials and request the 'barrons' profile.

        Shows a PasswordDialog; nothing is requested unless the dialog's
        result is QDialog.Accepted. checked is unused.
        '''
        prompt = '<p>Please enter your username and password for barrons.com<br>Click OK to proceed.'
        dialog = PasswordDialog(self, 'barrons info dialog', prompt)
        dialog.exec_()
        if dialog.result() != QDialog.Accepted:
            return
        user = dialog.username()
        secret = dialog.password()
        self.fetch_news('barrons', 'Barrons', username=user, password=secret)