Update Barrons

Fixes #1308140 [Private bug](https://bugs.launchpad.net/calibre/+bug/1308140)
This commit is contained in:
Kovid Goyal 2014-04-16 09:06:54 +05:30
parent ac6af50422
commit 112e38cb54

View File

@ -1,129 +1,78 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
class Barrons(BasicNewsRecipe):
    '''
    News recipe that downloads articles from the Barron's RSS feeds.

    A username and password are required (``needs_subscription``); they are
    submitted through the site's login form in :meth:`javascript_login`.
    '''

    title = 'Barron\'s'
    max_articles_per_feed = 50
    needs_subscription = True
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
    timefmt = ' [%a, %b %d, %Y]'
    use_embedded_content = False
    no_stylesheets = True
    # Raw string so that the escaped '?' reaches the regex engine unchanged
    # instead of relying on Python passing an unknown escape through.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    conversion_options = {'linearize_tables': True}
    ##delay = 1
    # Don't grab articles more than 7 days old
    oldest_article = 7
    use_javascript_to_login = True
    requires_version = (0, 9, 16)

    # Keep only elements whose class attribute starts with one of the
    # article column prefixes.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and x.startswith(
            ('sector one column', 'sector two column'))}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': [
            'sTools sTools-t', 'tabContainer artTabbedNav',
            'rssToolBox hidden', 'articleToolbox']}),
        dict(attrs={'class': ['insetButton', 'insettipBox', 'insetClose']}),
        dict(attrs={'data-module-name': [
            'resp.module.trendingNow.BarronsDesktop',
            'resp.module.share_tools.ShareTools']}),
        dict(name='span', attrs={'data-country-code': True, 'data-ticker-code': True}),
    ]

    def javascript_login(self, br, username, password):
        '''
        Log in by filling and submitting the first form on the login page.

        :param br: browser object used to visit the page and submit the form
        :param username: value stored in the form's ``username`` field
        :param password: value stored in the form's ``password`` field
        '''
        br.visit('http://commerce.barrons.com/auth/login')
        f = br.select_form(nr=0)
        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)

    # Use the print version of a page when available.
    def print_version(self, url):
        '''
        Return ``url`` without its query string, with ``#text.print`` appended.

        :param url: article URL, with or without a ``?query`` part
        :return: the URL up to the last ``?`` followed by ``#text.print``
        '''
        main, sep, _ = url.rpartition('?')
        if not sep:
            # rpartition() puts the whole string in the LAST slot when the
            # separator is absent, so fall back to the untouched URL.
            main = url
        return main + '#text.print'

    def preprocess_html(self, soup):
        '''
        Strip thumbnail images from zoomable-image boxes and return ``soup``.

        :param soup: parsed article document; modified in place
        :return: the same ``soup`` object
        '''
        # Remove thumbnail for zoomable images
        for div in soup.findAll('div', attrs={'class': lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()

        return soup

    # Comment out the feeds you don't want retrieved.
    # Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
    def get_feeds(self):
        '''Return the list of ``(title, feed URL)`` pairs to download.'''
        return [
            ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    def get_article_url(self, article):
        '''Return the ``link`` entry of a feed article, or ``None`` if absent.'''
        return article.get('link', None)

    def get_cover_url(self):
        '''
        Look up the magazine cover image on the Barron's home page.

        :return: the ``src`` of the first image inside the
                 ``newsItem barronsMag`` list, or ``None`` when the list, the
                 image or its ``src`` attribute is missing
        '''
        index = 'http://online.barrons.com/home-page'
        soup = self.index_to_soup(index)
        link_item = soup.find('ul', attrs={'class': 'newsItem barronsMag'})
        if link_item is None:
            return None
        img = link_item.img
        if img is None:
            # The list exists but carries no image; report "no cover"
            # instead of failing on None['src'].
            return None
        return img.get('src')