mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add WSJ, Barrons and Portfolio.com profiles
This commit is contained in:
parent
f37d8c9dc4
commit
99ceb7a142
@ -32,9 +32,13 @@ from libprs500.ebooks.lrf.web.profiles.newyorkreview import NewYorkReviewOfBooks
|
||||
from libprs500.ebooks.lrf.web.profiles.spiegelde import SpiegelOnline
|
||||
from libprs500.ebooks.lrf.web.profiles.zeitde import ZeitNachrichten
|
||||
from libprs500.ebooks.lrf.web.profiles.faznet import FazNet
|
||||
from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
|
||||
from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
|
||||
from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio
|
||||
|
||||
## Every profile class bundled with web2lrf.
builtin_profiles = [
    NYTimes, BBC, Newsweek, Economist, NewYorkReviewOfBooks,
    SpiegelOnline, ZeitNachrichten, FazNet, WallStreetJournal,
    Barrons, Portfolio,
]
## A profile is addressed by the last component of the dotted path of the
## module that defines it (e.g. 'wsj' for ...web.profiles.wsj).
available_profiles = [profile_cls.__module__.rpartition('.')[2]
                      for profile_cls in builtin_profiles]
|
||||
|
||||
def option_parser():
|
||||
|
89
src/libprs500/ebooks/lrf/web/profiles/barrons.py
Normal file
89
src/libprs500/ebooks/lrf/web/profiles/barrons.py
Normal file
@ -0,0 +1,89 @@
|
||||
##
|
||||
## web2lrf profile to download articles from Barrons.com
|
||||
## can download subscriber-only content if username and
|
||||
## password are supplied.
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class Barrons(DefaultProfile):
    '''
    web2lrf profile that downloads articles from Barrons.com.

    When both a username and a password are set, the profile logs in
    before fetching so that subscriber-only content can be retrieved.
    '''

    title = "Barron's"
    timefmt = ' [%a, %b %d, %Y]'
    max_recursions = 3
    max_articles_per_feed = 50
    html_description = True
    no_stylesheets = False
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    html2lrf_options = ['--ignore-tables', '--base-font-size=10']
    ##delay = 1

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    ## Each (pattern, replacement) pair is compiled case-insensitively and
    ## with '.' matching newlines as well.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start',
             lambda match: '<body><!-- article start'),

            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p',
             lambda match: '<p'),

            ## Remove any reprint info from the body of the article.
            (r'<hr size.*?<p',
             lambda match: '<p'),

            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>',
             lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        '''
        Return the browser used for fetching, logged in to barrons.com
        when both self.username and self.password are set.
        '''
        ## NOTE(review): the base method is called on the class without an
        ## instance -- confirm DefaultProfile.get_browser accepts that.
        browser = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return browser
        browser.open('http://commerce.barrons.com/auth/login')
        browser.select_form(name='login_form')
        browser['user'] = self.username
        browser['password'] = self.password
        browser.submit()
        return browser

    ## Use the print version of a page when available.
    def print_version(self, url):
        '''Return `url` with '/article/' replaced by '/article_print/'.'''
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them to put them in the order you desire
    def get_feeds(self):
        '''Return the list of (feed title, feed URL) pairs to download.'''
        feeds = [
            ("This Week's Magazine", 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]
        return feeds
|
||||
|
||||
## Log out of the website.
|
||||
## NOT CURRENTLY WORKING
|
||||
# def cleanup(self):
|
||||
# try:
|
||||
# self.browser.set_debug_responses(True)
|
||||
# import sys, logging
|
||||
# logger = logging.getLogger("mechanize")
|
||||
# logger.addHandler(logging.StreamHandler(sys.stdout))
|
||||
# logger.setLevel(logging.INFO)
|
||||
|
||||
# res = self.browser.open('http://online.barrons.com/logout')
|
||||
# except:
|
||||
# import traceback
|
||||
# traceback.print_exc()
|
||||
|
||||
|
||||
|
42
src/libprs500/ebooks/lrf/web/profiles/portfolio.py
Normal file
42
src/libprs500/ebooks/lrf/web/profiles/portfolio.py
Normal file
@ -0,0 +1,42 @@
|
||||
##
|
||||
## web2lrf profile to download articles from Portfolio.com
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class Portfolio(DefaultProfile):
    '''
    web2lrf profile that downloads articles from Portfolio.com.
    '''

    title = 'Portfolio'
    timefmt = ' [%a, %b %d, %Y]'
    max_recursions = 0
    max_articles_per_feed = 50
    html_description = True
    no_stylesheets = True
    html2lrf_options = ['--ignore-tables']
    ##delay = 1

    ## Don't grab articles more than 30 days old
    oldest_article = 30

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them or use spaces to put them in the order
    ## you desire
    def get_feeds(self):
        '''Return the list of (feed title, feed URL) pairs to download.'''
        feeds = [
            ('Business Travel', 'http://feeds.portfolio.com/portfolio/businesstravel'),
            ('Careers', 'http://feeds.portfolio.com/portfolio/careers'),
            ('Culture and Lifestyle', 'http://feeds.portfolio.com/portfolio/cultureandlifestyle'),
            ('Executives', 'http://feeds.portfolio.com/portfolio/executives'),
            ('News and Markets', 'http://feeds.portfolio.com/portfolio/news'),
            ('Business Spin', 'http://feeds.portfolio.com/portfolio/businessspin'),
            ('Capital', 'http://feeds.portfolio.com/portfolio/capital'),
            ('Daily Brief', 'http://feeds.portfolio.com/portfolio/dailybrief'),
            ('Market Movers', 'http://feeds.portfolio.com/portfolio/marketmovers'),
            ('Mixed Media', 'http://feeds.portfolio.com/portfolio/mixedmedia'),
            ('Odd Numbers', 'http://feeds.portfolio.com/portfolio/oddnumbers'),
            ('Playbook', 'http://feeds.portfolio.com/portfolio/playbook'),
            ('Tech Observer', 'http://feeds.portfolio.com/portfolio/thetechobserver'),
            ('World According to ...', 'http://feeds.portfolio.com/portfolio/theworldaccordingto'),
        ]
        return feeds
|
||||
|
109
src/libprs500/ebooks/lrf/web/profiles/wsj.py
Normal file
109
src/libprs500/ebooks/lrf/web/profiles/wsj.py
Normal file
@ -0,0 +1,109 @@
|
||||
##
|
||||
## web2lrf profile to download articles from WSJ.com
|
||||
## can download subscriber-only content if username and
|
||||
## password are supplied.
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class WallStreetJournal(DefaultProfile):
    '''
    web2lrf profile that downloads articles from WSJ.com.

    When both a username and a password are set, the profile logs in
    before fetching so that subscriber-only content can be retrieved.
    '''

    title = 'Wall Street Journal'
    timefmt = ' [%a, %b %d, %Y]'
    max_recursions = 2
    max_articles_per_feed = 50
    html_description = True
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    ## Each (pattern, replacement) pair is compiled case-insensitively and
    ## with '.' matching newlines as well.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start',
             lambda match: '<body><!-- article start'),

            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p',
             lambda match: '<p'),

            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>',
             lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        '''
        Return the browser used for fetching, logged in to wsj.com when
        both self.username and self.password are set.
        '''
        ## NOTE(review): the base method is called on the class without an
        ## instance -- confirm DefaultProfile.get_browser accepts that.
        browser = DefaultProfile.get_browser()
        if self.username is None or self.password is None:
            return browser
        browser.open('http://online.wsj.com/login')
        browser.select_form(name='login_form')
        browser['user'] = self.username
        browser['password'] = self.password
        browser.submit()
        return browser

    def print_version(self, url):
        '''Return `url` with '/article/' replaced by '/article_print/'.'''
        return url.replace('/article/', '/article_print/')

    ## Comment out the feeds you don't want retrieved.
    ## Because these feeds are sorted alphabetically when converted to LRF,
    ## you may want to number them or use spaces to put them in the order
    ## you desire
    def get_feeds(self):
        '''Return the list of (feed title, feed URL) pairs to download.'''
        ## NOTE(review): 'News - Health' and 'Personal Journal - Health'
        ## both point at 3_7089.xml; the other repeated feeds below are
        ## commented out -- confirm whether this duplicate is intended.
        feeds = [
            #('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
            #('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
            #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
            (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
            (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
            # ('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
            (" Today's Newspaper - Page One", 'http://online.wsj.com/xml/rss/3_7205.xml'),
            (" Today's Newspaper - Marketplace", 'http://online.wsj.com/xml/rss/3_7206.xml'),
            (" Today's Newspaper - Money & Investing", 'http://online.wsj.com/xml/rss/3_7207.xml'),
            (" Today's Newspaper - Personal Journal", 'http://online.wsj.com/xml/rss/3_7208.xml'),
            (" Today's Newspaper - Weekend Journal", 'http://online.wsj.com/xml/rss/3_7209.xml'),
            ('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
            ("News - U.S.: What's News", 'http://online.wsj.com/xml/rss/3_7011.xml'),
            ('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
            ("News - Europe: What's News", 'http://online.wsj.com/xml/rss/3_7012.xml'),
            ("News - Asia: What's News", 'http://online.wsj.com/xml/rss/3_7013.xml'),
            ('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
            ('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
            ('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
            ('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
            ('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
            ('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
            ("Technology - What's News", 'http://online.wsj.com/xml/rss/3_7015.xml'),
            ('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
            ('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
            ('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
            ('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
            ('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
            ('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
            ('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
            ('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
            ('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
            ('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
            ('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
            ('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
            ('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
            ('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
            ('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
            ('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
            ('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
            # ('Personal Journal - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
            ('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
            ('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
            ('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
            ('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
            # ('Weekend & Leisure - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
            # ('Weekend & Leisure - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
            ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
        ]
        return feeds
|
||||
|
||||
## Log out of the website.
|
||||
## NOT CURRENTLY WORKING
|
||||
# def cleanup(self):
|
||||
# self.browser.open('http://commerce.wsj.com/auth/postlogout')
|
@ -26,19 +26,25 @@ class NewsMenu(QMenu):
|
||||
|
||||
def __init__(self):
    '''Build the menu with one entry per supported news source.'''
    QMenu.__init__(self)
    # (text, slot[, icon resource path]) for each entry, in display order.
    entries = (
        ('Barrons', self.fetch_news_barrons),
        ('BBC', self.fetch_news_bbc, ':/images/news/bbc.png'),
        ('Economist', self.fetch_news_economist, ':/images/news/economist.png'),
        ('Faz.net', self.fetch_news_faznet, ':/images/news/faznet.png'),
        ('Newsweek', self.fetch_news_newsweek, ':/images/news/newsweek.png'),
        ('New York Review of Books', self.fetch_news_nyreview, ':/images/book.svg'),
        ('New York Times', self.fetch_news_nytimes, ':/images/news/nytimes.png'),
        ('Portfolio.com', self.fetch_news_portfolio),
        ('Spiegel Online', self.fetch_news_spiegelde, ':/images/news/spiegelonline.png'),
        ('Wall Street Journal', self.fetch_news_wsj),
        ('Zeit Nachrichten', self.fetch_news_zeitde, ':/images/news/diezeit.png'),
    )
    for entry in entries:
        # Entries without an icon are passed as (text, slot) only.
        self.add_menu_item(*entry)
|
||||
|
||||
def fetch_news(self, profile, title, username=None, password=None):
    '''
    Emit the fetch_news(PyQt_PyObject) signal with a dict holding
    `profile`, `title`, `username` and `password`.
    '''
    request = {
        'profile': profile,
        'title': title,
        'username': username,
        'password': password,
    }
    self.emit(SIGNAL('fetch_news(PyQt_PyObject)'), request)
|
||||
|
||||
def fetch_news_portfolio(self, checked):
    '''Request a download with the 'portfolio' profile; `checked` is unused.'''
    self.fetch_news(profile='portfolio', title='Portfolio.com')
|
||||
|
||||
def fetch_news_spiegelde(self, checked):
    '''Request a download with the 'spiegelde' profile; `checked` is unused.'''
    self.fetch_news(profile='spiegelde', title='Spiegel Online')
|
||||
|
||||
@ -66,4 +72,20 @@ class NewsMenu(QMenu):
|
||||
d.exec_()
|
||||
if d.result() == QDialog.Accepted:
|
||||
un, pw = d.username(), d.password()
|
||||
self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
|
||||
self.fetch_news('nytimes', 'New York Times', username=un, password=pw)
|
||||
|
||||
def fetch_news_wsj(self, checked):
    '''
    Ask for wsj.com credentials and, if the dialog is accepted, request
    a download with the 'wsj' profile. `checked` is unused.
    '''
    dialog = PasswordDialog(
        self, 'wsj info dialog',
        '<p>Please enter your username and password for wsj.com<br>Click OK to proceed.')
    dialog.exec_()
    if dialog.result() != QDialog.Accepted:
        # Anything other than an accepted dialog: fetch nothing.
        return
    self.fetch_news('wsj', 'Wall Street Journal',
                    username=dialog.username(), password=dialog.password())
|
||||
|
||||
def fetch_news_barrons(self, checked):
    '''
    Ask for barrons.com credentials and, if the dialog is accepted,
    request a download with the 'barrons' profile. `checked` is unused.
    '''
    dialog = PasswordDialog(
        self, 'barrons info dialog',
        '<p>Please enter your username and password for barrons.com<br>Click OK to proceed.')
    dialog.exec_()
    if dialog.result() != QDialog.Accepted:
        # Anything other than an accepted dialog: fetch nothing.
        return
    self.fetch_news('barrons', 'Barrons',
                    username=dialog.username(), password=dialog.password())
|
Loading…
x
Reference in New Issue
Block a user