Add WSJ, Barron's, and Portfolio.com profiles

This commit is contained in:
Kovid Goyal 2007-12-15 22:07:16 +00:00
parent f37d8c9dc4
commit 99ceb7a142
5 changed files with 269 additions and 3 deletions

View File

@ -32,9 +32,13 @@ from libprs500.ebooks.lrf.web.profiles.newyorkreview import NewYorkReviewOfBooks
from libprs500.ebooks.lrf.web.profiles.spiegelde import SpiegelOnline
from libprs500.ebooks.lrf.web.profiles.zeitde import ZeitNachrichten
from libprs500.ebooks.lrf.web.profiles.faznet import FazNet
from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio
builtin_profiles = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks, \
SpiegelOnline, ZeitNachrichten, FazNet]
# Registry of all built-in news-download profiles. Order here does not
# affect lookup, which goes through available_profiles below.
builtin_profiles = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks, \
SpiegelOnline, ZeitNachrichten, FazNet, WallStreetJournal, \
Barrons, Portfolio]
# Map each profile class to the last component of its module path
# (e.g. 'wsj'), used as the profile's command-line identifier.
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
def option_parser():

View File

@ -0,0 +1,89 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Barrons(DefaultProfile):
    '''
    web2lrf profile to download articles from Barrons.com. Can download
    subscriber-only content if username and password are supplied.
    '''

    title = 'Barron\'s'
    max_recursions = 3
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    ## Raw string: the original non-raw literal contained '\?', an invalid
    ## escape sequence (SyntaxWarning on modern Python). The compiled
    ## pattern is unchanged.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    ## Redundant parentheses removed -- ('x') is just the string 'x'.
    html2lrf_options = ['--ignore-tables', '--base-font-size=10']
    ##delay = 1
    ## Don't grab articles more than 7 days old
    oldest_article = 7

    ## (pattern, substitution) pairs applied to each fetched page to strip
    ## everything outside the article body before conversion.
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
            ## Remove any reprint info from the body of the article.
            (r'<hr size.*?<p', lambda match : '<p'),
            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match : '</body>'),
        ]
    ]

    def get_browser(self):
        '''
        Return a browser object, logged in to barrons.com when a username
        and password were supplied, so subscriber-only articles can be
        fetched.
        '''
        ## NOTE(review): DefaultProfile.get_browser is called without an
        ## instance argument -- presumably a classmethod/staticmethod on
        ## the base class; confirm.
        br = DefaultProfile.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://commerce.barrons.com/auth/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def print_version(self, url):
        ## Use the print version of a page when available.
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them to put them in the order you desire.
    def get_feeds(self):
        return [
            ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    ## Logout of website -- NOT CURRENTLY WORKING
    # def cleanup(self):
    #     try:
    #         self.browser.set_debug_responses(True)
    #         import sys, logging
    #         logger = logging.getLogger("mechanize")
    #         logger.addHandler(logging.StreamHandler(sys.stdout))
    #         logger.setLevel(logging.INFO)
    #         res = self.browser.open('http://online.barrons.com/logout')
    #     except:
    #         import traceback
    #         traceback.print_exc()

View File

@ -0,0 +1,42 @@
##
## web2lrf profile to download articles from Portfolio.com
##
'''
'''
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Portfolio(DefaultProfile):
    '''
    web2lrf profile that fetches articles from the Portfolio.com RSS feeds.
    No login is required.
    '''

    title = 'Portfolio'
    max_recursions = 0
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = True
    html2lrf_options = ['--ignore-tables']
    ##delay = 1
    ## Skip articles more than 30 days old.
    oldest_article = 30

    ## Comment out the feeds you don't want retrieved. Feeds are sorted
    ## alphabetically when converted to LRF, so number them or use leading
    ## spaces to control the order.
    def get_feeds(self):
        base = 'http://feeds.portfolio.com/portfolio/'
        slugs = [
            ('Business Travel', 'businesstravel'),
            ('Careers', 'careers'),
            ('Culture and Lifestyle', 'cultureandlifestyle'),
            ('Executives', 'executives'),
            ('News and Markets', 'news'),
            ('Business Spin', 'businessspin'),
            ('Capital', 'capital'),
            ('Daily Brief', 'dailybrief'),
            ('Market Movers', 'marketmovers'),
            ('Mixed Media', 'mixedmedia'),
            ('Odd Numbers', 'oddnumbers'),
            ('Playbook', 'playbook'),
            ('Tech Observer', 'thetechobserver'),
            ('World According to ...', 'theworldaccordingto'),
        ]
        return [(name, base + slug) for name, slug in slugs]

View File

@ -0,0 +1,109 @@
##
## web2lrf profile to download articles from WSJ.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class WallStreetJournal(DefaultProfile):
    '''
    web2lrf profile to download articles from WSJ.com. Subscriber-only
    content is downloaded when a username and password are supplied.
    '''

    title = 'Wall Street Journal'
    max_recursions = 2
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']
    ## Skip articles more than 7 days old.
    oldest_article = 7

    ## (pattern, substitution) pairs applied to every fetched page; they
    ## strip everything outside the article body before conversion.
    _CLEANUPS = [
        ## Drop everything before the start-of-article marker.
        (r'<body.*?<!-- article start', lambda m: '<body><!-- article start'),
        ## Drop inset boxes embedded in the article body.
        (r'<div id="inset".*?</div>.?</div>.?<p', lambda m: '<p'),
        ## Drop everything after the end-of-article marker.
        (r'<!-- article end.*?</body>', lambda m: '</body>'),
    ]
    preprocess_regexps = [(re.compile(pat, re.IGNORECASE | re.DOTALL), fn)
                          for pat, fn in _CLEANUPS]

    def get_browser(self):
        '''
        Return a browser object, logged in to wsj.com when a username and
        password were supplied, so subscriber-only articles can be fetched.
        '''
        ## NOTE(review): DefaultProfile.get_browser is called without an
        ## instance argument -- presumably a classmethod/staticmethod on
        ## the base class; confirm.
        br = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return br
        br.open('http://online.wsj.com/login')
        br.select_form(name='login_form')
        br['user'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    def print_version(self, url):
        ## Prefer the printer-friendly rendering of each article.
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved. Feeds are sorted
    ## alphabetically when converted to LRF, so leading spaces or numbers
    ## in a title control the ordering.
    def get_feeds(self):
        ## All WSJ feeds share one URL pattern; only the numeric id varies.
        rss = 'http://online.wsj.com/xml/rss/3_%s.xml'
        feeds = [
            #('Most Emailed - Day', '7030'),
            #('Most Emailed - Week', '7253'),
            #('Most Emailed - Month', '7254'),
            (' Most Viewed - Day', '7198'),
            (' Most Viewed - Week', '7251'),
            # ('Most Viewed - Month', '7252'),
            (' Today\'s Newspaper - Page One', '7205'),
            (' Today\'s Newspaper - Marketplace', '7206'),
            (' Today\'s Newspaper - Money & Investing', '7207'),
            (' Today\'s Newspaper - Personal Journal', '7208'),
            (' Today\'s Newspaper - Weekend Journal', '7209'),
            ('Opinion', '7041'),
            ('News - U.S.: What\'s News', '7011'),
            ('News - U.S. Business', '7014'),
            ('News - Europe: What\'s News', '7012'),
            ('News - Asia: What\'s News', '7013'),
            ('News - World News', '7085'),
            ('News - Economy', '7086'),
            ('News - Earnings', '7088'),
            ('News - Health', '7089'),
            ('News - Law', '7091'),
            ('News - Media & Marketing', '7020'),
            ('Technology - What\'s News', '7015'),
            ('Technology - Gadgets', '7094'),
            ('Technology - Telecommunications', '7095'),
            ('Technology - E-commerce/Media', '7096'),
            ('Technology - Asia', '7097'),
            ('Technology - Europe', '7098'),
            ('Markets - News', '7031'),
            ('Markets - Europe News', '7101'),
            ('Markets - Asia News', '7102'),
            ('Markets - Deals & Deal Makers', '7099'),
            ('Markets - Hedge Funds', '7199'),
            ('Personal Journal', '7200'),
            ('Personal Journal - Money', '7104'),
            ('Personal Journal - Health', '7089'),
            ('Personal Journal - Autos', '7092'),
            ('Personal Journal - Homes', '7105'),
            ('Personal Journal - Travel', '7106'),
            ('Personal Journal - Careers', '7107'),
            # ('Personal Journal - Gadgets', '7094'),
            ('Weekend & Leisure', '7201'),
            ('Weekend & Leisure - Weekend Journal', '7202'),
            ('Weekend & Leisure - Arts & Entertainment', '7177'),
            ('Weekend & Leisure - Books', '7203'),
            # ('Weekend & Leisure - Travel', '7106'),
            # ('Weekend & Leisure - Autos', '7092'),
            ('Weekend & Leisure - Sports', '7204'),
        ]
        return [(name, rss % fid) for name, fid in feeds]

    ## Logout of website -- NOT CURRENTLY WORKING
    # def cleanup(self):
    #     self.browser.open('http://commerce.wsj.com/auth/postlogout')

View File

@ -26,19 +26,25 @@ class NewsMenu(QMenu):
def __init__(self):
    # Build the News menu. Entries are listed alphabetically; some carry a
    # bundled icon resource, the newly added ones (Barrons, Portfolio.com,
    # WSJ) do not.
    QMenu.__init__(self)
    self.add_menu_item('Barrons', self.fetch_news_barrons)
    self.add_menu_item('BBC', self.fetch_news_bbc, ':/images/news/bbc.png')
    self.add_menu_item('Economist', self.fetch_news_economist, ':/images/news/economist.png')
    self.add_menu_item('Faz.net', self.fetch_news_faznet, ':/images/news/faznet.png')
    self.add_menu_item('Newsweek', self.fetch_news_newsweek, ':/images/news/newsweek.png')
    self.add_menu_item('New York Review of Books', self.fetch_news_nyreview, ':/images/book.svg')
    self.add_menu_item('New York Times', self.fetch_news_nytimes, ':/images/news/nytimes.png')
    self.add_menu_item('Portfolio.com', self.fetch_news_portfolio)
    self.add_menu_item('Spiegel Online', self.fetch_news_spiegelde, ':/images/news/spiegelonline.png')
    self.add_menu_item('Wall Street Journal', self.fetch_news_wsj)
    self.add_menu_item('Zeit Nachrichten', self.fetch_news_zeitde, ':/images/news/diezeit.png')
def fetch_news(self, profile, title, username=None, password=None):
    '''
    Emit the fetch_news(PyQt_PyObject) signal carrying the profile module
    name, display title and optional login credentials.
    '''
    payload = {
        'profile': profile,
        'title': title,
        'username': username,
        'password': password,
    }
    self.emit(SIGNAL('fetch_news(PyQt_PyObject)'), payload)
def fetch_news_portfolio(self, checked):
    # Menu handler: Portfolio.com needs no login, so fetch directly.
    self.fetch_news('portfolio', 'Portfolio.com')
def fetch_news_spiegelde(self, checked):
    # Menu handler: Spiegel Online needs no login, so fetch directly.
    self.fetch_news('spiegelde', 'Spiegel Online')
@ -66,4 +72,20 @@ class NewsMenu(QMenu):
d.exec_()
if d.result() == QDialog.Accepted:
un, pw = d.username(), d.password()
self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
def fetch_news_wsj(self, checked):
    '''
    Menu handler: prompt for wsj.com credentials, then fetch the Wall
    Street Journal profile. Does nothing if the dialog is cancelled.
    '''
    d = PasswordDialog(self, 'wsj info dialog',
                       '<p>Please enter your username and password for wsj.com<br>Click OK to proceed.')
    d.exec_()
    if d.result() != QDialog.Accepted:
        return
    self.fetch_news('wsj', 'Wall Street Journal',
                    username=d.username(), password=d.password())
def fetch_news_barrons(self, checked):
    '''
    Menu handler: prompt for barrons.com credentials, then fetch the
    Barrons profile. Does nothing if the dialog is cancelled.
    '''
    d = PasswordDialog(self, 'barrons info dialog',
                       '<p>Please enter your username and password for barrons.com<br>Click OK to proceed.')
    d.exec_()
    if d.result() != QDialog.Accepted:
        return
    self.fetch_news('barrons', 'Barrons',
                    username=d.username(), password=d.password())