mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
79 lines
3.0 KiB
Plaintext
79 lines
3.0 KiB
Plaintext
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
class Barrons(BasicNewsRecipe):
    """calibre news recipe that downloads Barron's from online.barrons.com.

    The recipe logs in with the subscriber's credentials through the
    JavaScript-capable browser, reads the RSS feeds listed in
    ``get_feeds`` and rewrites every article URL to its print version.
    """

    # --- Recipe metadata ---------------------------------------------------
    title = "Barron's"
    max_articles_per_feed = 50
    needs_subscription = True
    language = 'en'

    __author__ = 'Kovid Goyal'
    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
    timefmt = ' [%a, %b %d, %Y]'
    use_embedded_content = False
    no_stylesheets = True
    # Raw string: the pattern contains ``\?`` which is an invalid escape
    # sequence in a normal string literal (warns on modern Python).
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    conversion_options = {'linearize_tables': True}
    # delay = 1  (left disabled, as in the original recipe)

    # Don't grab articles more than 7 days old
    oldest_article = 7
    use_javascript_to_login = True
    requires_version = (0, 9, 16)

    # --- HTML clean-up rules -----------------------------------------------
    # Keep only elements whose class attribute starts with one of the two
    # "sector ... column" prefixes.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and x.startswith(
            ('sector one column', 'sector two column'))}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': [
            'sTools sTools-t', 'tabContainer artTabbedNav',
            'rssToolBox hidden', 'articleToolbox']}),
        dict(attrs={'class': ['insetButton', 'insettipBox', 'insetClose']}),
        dict(attrs={'data-module-name': [
            'resp.module.trendingNow.BarronsDesktop',
            'resp.module.share_tools.ShareTools']}),
        dict(name='span', attrs={'data-country-code': True, 'data-ticker-code': True}),
    ]

    def javascript_login(self, br, username, password):
        """Log in to Barron's by filling in the first form on the login page.

        :param br: browser object exposing ``visit``, ``select_form`` and
            ``submit`` (supplied by the caller).
        :param username: subscriber user name written to the ``username`` field.
        :param password: subscriber password written to the ``password`` field.
        """
        br.visit('http://commerce.barrons.com/auth/login')
        form = br.select_form(nr=0)
        form['username'] = username
        form['password'] = password
        br.submit(timeout=120)

    # Use the print version of a page when available.
    def print_version(self, url):
        """Return ``url`` without its query string, with ``#text.print`` appended.

        Everything from the last ``?`` onwards is dropped. A URL that has
        no ``?`` at all is used unchanged as the base.
        """
        main, sep, _rest = url.rpartition('?')
        if not sep:
            # rpartition() returns ('', '', url) when the separator is
            # missing; without this guard the whole URL would be discarded.
            main = url
        return main + '#text.print'

    def preprocess_html(self, soup):
        """Remove the thumbnail image from every zoomable-image container.

        For each ``div`` whose class list contains ``insetZoomTargetBox``
        the first ``img`` found inside it is detached from the tree.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        zoom_boxes = soup.findAll(
            'div',
            attrs={'class': lambda x: x and 'insetZoomTargetBox' in x.split()})
        for div in zoom_boxes:
            img = div.find('img')
            if img is not None:
                img.extract()

        return soup

    # Comment out the feeds you don't want retrieved.
    # Because these feeds are sorted alphabetically when converted to LRF,
    # you may want to number them to put them in the order you desire

    def get_feeds(self):
        """Return the list of ``(title, rss_url)`` pairs to download."""
        return [
            ("This Week's Magazine", 'http://online.barrons.com/xml/rss/3_7510.xml'),
            ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
            ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
            ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
            ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
            ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
        ]

    def get_article_url(self, article):
        """Return the ``link`` entry of a feed article, or ``None`` if absent."""
        return article.get('link', None)

    def get_cover_url(self):
        """Return the cover image URL scraped from the home page, or ``None``.

        Looks for the ``<ul class="newsItem barronsMag">`` element on the
        home page and returns the ``src`` of the first image inside it.
        ``None`` is returned when the list, the image or its ``src``
        attribute cannot be found.
        """
        index = 'http://online.barrons.com/home-page'
        soup = self.index_to_soup(index)
        link_item = soup.find('ul', attrs={'class': 'newsItem barronsMag'})
        if link_item is None:
            return None
        img = link_item.find('img')
        if img is None:
            # Markup changed or no cover shown: report "no cover" instead
            # of failing with a TypeError on ``None['src']``.
            return None
        return img.get('src')
|
|
|
|
|