Add WSJ, Barrons and Portfolio.com profiles

This commit is contained in:
Kovid Goyal 2007-12-15 22:07:16 +00:00
parent f37d8c9dc4
commit 99ceb7a142
5 changed files with 269 additions and 3 deletions

View File

@ -32,9 +32,13 @@ from libprs500.ebooks.lrf.web.profiles.newyorkreview import NewYorkReviewOfBooks
from libprs500.ebooks.lrf.web.profiles.spiegelde import SpiegelOnline from libprs500.ebooks.lrf.web.profiles.spiegelde import SpiegelOnline
from libprs500.ebooks.lrf.web.profiles.zeitde import ZeitNachrichten from libprs500.ebooks.lrf.web.profiles.zeitde import ZeitNachrichten
from libprs500.ebooks.lrf.web.profiles.faznet import FazNet from libprs500.ebooks.lrf.web.profiles.faznet import FazNet
from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio
builtin_profiles = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks, \ builtin_profiles = [NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks, \
SpiegelOnline, ZeitNachrichten, FazNet] SpiegelOnline, ZeitNachrichten, FazNet, WallStreetJournal, \
Barrons, Portfolio]
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
def option_parser(): def option_parser():

View File

@ -0,0 +1,89 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Barrons(DefaultProfile):
    """web2lrf profile that downloads articles from Barrons.com.

    When both ``username`` and ``password`` are set on the profile, the
    browser logs in first so that subscriber-only content can be fetched.
    """

    title = "Barron's"
    max_recursions = 3
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    # Raw string so that ``\?`` reaches the regex engine as an escaped
    # question mark instead of being an unrecognized string escape.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    html2lrf_options = ['--ignore-tables', '--base-font-size=10']
    ##delay = 1

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    # Pairs of (compiled pattern, replacement callable). Every pattern is
    # compiled case-insensitively and with DOTALL so '.' also spans newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start',
             lambda match: '<body><!-- article start'),
            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),
            ## Remove any reprint info from the body of the article.
            (r'<hr size.*?<p', lambda match: '<p'),
            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        """Return the browser used for fetching, logged in when possible.

        The login form is submitted only when both ``self.username`` and
        ``self.password`` are not None; otherwise the browser is returned
        as obtained from ``DefaultProfile``.
        """
        # NOTE(review): invoked on the class without an instance, so this
        # presumably relies on DefaultProfile.get_browser being a
        # classmethod/staticmethod - confirm against DefaultProfile.
        br = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return br
        br.open('http://commerce.barrons.com/auth/login')
        br.select_form(name='login_form')
        br['user'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    ## Use the print version of a page when available.
    def print_version(self, url):
        """Return ``url`` with '/article/' replaced by '/article_print/'."""
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them to put them in the order you desire
    def get_feeds(self):
        """Return the list of (feed title, feed URL) pairs to download."""
        return [
            ("This Week's Magazine", 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    ## Logout of website
    ## NOT CURRENTLY WORKING
    # def cleanup(self):
    #     try:
    #         self.browser.set_debug_responses(True)
    #         import sys, logging
    #         logger = logging.getLogger("mechanize")
    #         logger.addHandler(logging.StreamHandler(sys.stdout))
    #         logger.setLevel(logging.INFO)
    #         res = self.browser.open('http://online.barrons.com/logout')
    #     except:
    #         import traceback
    #         traceback.print_exc()

View File

@ -0,0 +1,42 @@
##
## web2lrf profile to download articles from Portfolio.com
##
'''
'''
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Portfolio(DefaultProfile):
    """web2lrf profile that downloads articles from the Portfolio.com feeds."""

    title = 'Portfolio'
    max_recursions = 0
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = True
    html2lrf_options = ['--ignore-tables']
    ##delay = 1

    ## Don't grab articles more than 30 days old
    oldest_article = 30

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them or use spaces to put them in the order
    ## you desire
    def get_feeds(self):
        """Return the list of (feed title, feed URL) pairs to download.

        Every feed lives under the same URL prefix, so only the final path
        component is listed per feed and the full URL is assembled here.
        """
        prefix = 'http://feeds.portfolio.com/portfolio/'
        sections = [
            ('Business Travel', 'businesstravel'),
            ('Careers', 'careers'),
            ('Culture and Lifestyle', 'cultureandlifestyle'),
            ('Executives', 'executives'),
            ('News and Markets', 'news'),
            ('Business Spin', 'businessspin'),
            ('Capital', 'capital'),
            ('Daily Brief', 'dailybrief'),
            ('Market Movers', 'marketmovers'),
            ('Mixed Media', 'mixedmedia'),
            ('Odd Numbers', 'oddnumbers'),
            ('Playbook', 'playbook'),
            ('Tech Observer', 'thetechobserver'),
            ('World According to ...', 'theworldaccordingto'),
        ]
        return [(name, prefix + slug) for name, slug in sections]

View File

@ -0,0 +1,109 @@
##
## web2lrf profile to download articles from WSJ.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class WallStreetJournal(DefaultProfile):
    """web2lrf profile that downloads articles from WSJ.com.

    When both ``username`` and ``password`` are set on the profile, the
    browser logs in first so that subscriber-only content can be fetched.
    """

    title = 'Wall Street Journal'
    max_recursions = 2
    max_articles_per_feed = 50
    timefmt = ' [%a, %b %d, %Y]'
    html_description = True
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    # Pairs of (compiled pattern, replacement callable). Every pattern is
    # compiled case-insensitively and with DOTALL so '.' also spans newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start',
             lambda match: '<body><!-- article start'),
            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),
            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        """Return the browser used for fetching, logged in when possible.

        The login form is submitted only when both ``self.username`` and
        ``self.password`` are not None; otherwise the browser is returned
        as obtained from ``DefaultProfile``.
        """
        # NOTE(review): invoked on the class without an instance, so this
        # presumably relies on DefaultProfile.get_browser being a
        # classmethod/staticmethod - confirm against DefaultProfile.
        br = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return br
        br.open('http://online.wsj.com/login')
        br.select_form(name='login_form')
        br['user'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    def print_version(self, url):
        """Return ``url`` with '/article/' replaced by '/article_print/'."""
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them or use spaces to put them in the order
    ## you desire
    def get_feeds(self):
        """Return the list of (feed title, feed URL) pairs to download.

        Entries whose URL repeats a feed that is already listed under another
        title are kept commented out so the same feed is not fetched twice.
        """
        return [
            #('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
            #('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
            #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
            (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
            (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
            # ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
            (" Today's Newspaper - Page One", 'http://online.wsj.com/xml/rss/3_7205.xml'),
            (" Today's Newspaper - Marketplace", 'http://online.wsj.com/xml/rss/3_7206.xml'),
            (" Today's Newspaper - Money & Investing", 'http://online.wsj.com/xml/rss/3_7207.xml'),
            (" Today's Newspaper - Personal Journal", 'http://online.wsj.com/xml/rss/3_7208.xml'),
            (" Today's Newspaper - Weekend Journal", 'http://online.wsj.com/xml/rss/3_7209.xml'),
            ('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
            ("News - U.S.: What's News", 'http://online.wsj.com/xml/rss/3_7011.xml'),
            ('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
            ("News - Europe: What's News", 'http://online.wsj.com/xml/rss/3_7012.xml'),
            ("News - Asia: What's News", 'http://online.wsj.com/xml/rss/3_7013.xml'),
            ('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
            ('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
            ('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
            ('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
            ('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
            ('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
            ("Technology - What's News", 'http://online.wsj.com/xml/rss/3_7015.xml'),
            ('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
            ('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
            ('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
            ('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
            ('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
            ('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
            ('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
            ('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
            ('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
            ('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
            ('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
            ('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
            # Same feed URL as 'News - Health' above.
            # ('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
            ('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
            ('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
            ('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
            ('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
            # ('Personal Journal - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
            ('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
            ('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
            ('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
            ('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
            # ('Weekend & Leisure - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
            # ('Weekend & Leisure - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
            ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
        ]

    ## Logout of website
    ## NOT CURRENTLY WORKING
    # def cleanup(self):
    #     self.browser.open('http://commerce.wsj.com/auth/postlogout')

View File

@ -26,19 +26,25 @@ class NewsMenu(QMenu):
def __init__(self): def __init__(self):
QMenu.__init__(self) QMenu.__init__(self)
self.add_menu_item('Barrons', self.fetch_news_barrons)
self.add_menu_item('BBC', self.fetch_news_bbc, ':/images/news/bbc.png') self.add_menu_item('BBC', self.fetch_news_bbc, ':/images/news/bbc.png')
self.add_menu_item('Economist', self.fetch_news_economist, ':/images/news/economist.png') self.add_menu_item('Economist', self.fetch_news_economist, ':/images/news/economist.png')
self.add_menu_item('Faz.net', self.fetch_news_faznet, ':/images/news/faznet.png') self.add_menu_item('Faz.net', self.fetch_news_faznet, ':/images/news/faznet.png')
self.add_menu_item('Newsweek', self.fetch_news_newsweek, ':/images/news/newsweek.png') self.add_menu_item('Newsweek', self.fetch_news_newsweek, ':/images/news/newsweek.png')
self.add_menu_item('New York Review of Books', self.fetch_news_nyreview, ':/images/book.svg') self.add_menu_item('New York Review of Books', self.fetch_news_nyreview, ':/images/book.svg')
self.add_menu_item('New York Times', self.fetch_news_nytimes, ':/images/news/nytimes.png') self.add_menu_item('New York Times', self.fetch_news_nytimes, ':/images/news/nytimes.png')
self.add_menu_item('Portfolio.com', self.fetch_news_portfolio)
self.add_menu_item('Spiegel Online', self.fetch_news_spiegelde, ':/images/news/spiegelonline.png') self.add_menu_item('Spiegel Online', self.fetch_news_spiegelde, ':/images/news/spiegelonline.png')
self.add_menu_item('Wall Street Journal', self.fetch_news_wsj)
self.add_menu_item('Zeit Nachrichten', self.fetch_news_zeitde, ':/images/news/diezeit.png') self.add_menu_item('Zeit Nachrichten', self.fetch_news_zeitde, ':/images/news/diezeit.png')
def fetch_news(self, profile, title, username=None, password=None): def fetch_news(self, profile, title, username=None, password=None):
data = dict(profile=profile, title=title, username=username, password=password) data = dict(profile=profile, title=title, username=username, password=password)
self.emit(SIGNAL('fetch_news(PyQt_PyObject)'), data) self.emit(SIGNAL('fetch_news(PyQt_PyObject)'), data)
def fetch_news_portfolio(self, checked):
    """Menu handler: request a fetch of the 'portfolio' profile.

    ``checked`` is accepted but not used; no credentials are passed.
    """
    self.fetch_news(profile='portfolio', title='Portfolio.com')
def fetch_news_spiegelde(self, checked): def fetch_news_spiegelde(self, checked):
self.fetch_news('spiegelde', 'Spiegel Online') self.fetch_news('spiegelde', 'Spiegel Online')
@ -67,3 +73,19 @@ class NewsMenu(QMenu):
if d.result() == QDialog.Accepted: if d.result() == QDialog.Accepted:
un, pw = d.username(), d.password() un, pw = d.username(), d.password()
self.fetch_news('nytimes', 'New York Times', username=un, password=pw) self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
def fetch_news_wsj(self, checked):
    """Menu handler: ask for wsj.com credentials, then request a fetch.

    Nothing is fetched unless the password dialog is accepted. ``checked``
    is accepted but not used.
    """
    dialog = PasswordDialog(
        self,
        'wsj info dialog',
        '<p>Please enter your username and password for wsj.com<br>Click OK to proceed.',
    )
    dialog.exec_()
    if dialog.result() != QDialog.Accepted:
        return
    user = dialog.username()
    secret = dialog.password()
    self.fetch_news('wsj', 'Wall Street Journal', username=user, password=secret)
def fetch_news_barrons(self, checked):
    """Menu handler: ask for barrons.com credentials, then request a fetch.

    Nothing is fetched unless the password dialog is accepted. ``checked``
    is accepted but not used.
    """
    dialog = PasswordDialog(
        self,
        'barrons info dialog',
        '<p>Please enter your username and password for barrons.com<br>Click OK to proceed.',
    )
    dialog.exec_()
    if dialog.result() != QDialog.Accepted:
        return
    user = dialog.username()
    secret = dialog.password()
    self.fetch_news('barrons', 'Barrons', username=user, password=secret)