mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Barrons
Fixes #1308140 [Private bug](https://bugs.launchpad.net/calibre/+bug/1308140)
This commit is contained in:
parent
ac6af50422
commit
112e38cb54
@ -1,13 +1,3 @@
|
||||
##
|
||||
## web2lrf profile to download articles from Barrons.com
|
||||
## can download subscriber-only content if username and
|
||||
## password are supplied.
|
||||
##
|
||||
'''
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Barrons(BasicNewsRecipe):
|
||||
@ -17,7 +7,7 @@ class Barrons(BasicNewsRecipe):
|
||||
needs_subscription = True
|
||||
language = 'en'
|
||||
|
||||
__author__ = 'Kovid Goyal and Sujata Raman'
|
||||
__author__ = 'Kovid Goyal'
|
||||
description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
use_embedded_content = False
|
||||
@ -26,40 +16,17 @@ class Barrons(BasicNewsRecipe):
|
||||
conversion_options = {'linearize_tables': True}
|
||||
##delay = 1
|
||||
|
||||
## Don't grab articles more than 7 days old
|
||||
# Don't grab articles more than 7 days old
|
||||
oldest_article = 7
|
||||
use_javascript_to_login = True
|
||||
requires_version = (0, 9, 16)
|
||||
|
||||
extra_css = '''
|
||||
.datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
|
||||
h3{font-family:Georgia,"Times New Roman",Times,serif; }
|
||||
h2{font-family:Georgia,"Times New Roman",Times,serif; }
|
||||
h1{ font-family:Georgia,"Times New Roman",Times,serif; }
|
||||
.byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
|
||||
.subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
|
||||
.articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
|
||||
.insettipUnit{font-size: x-small;}
|
||||
'''
|
||||
keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})]
|
||||
remove_tags = [
|
||||
dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
|
||||
dict(name = 'a', attrs ={'class':'insetClose'})
|
||||
]
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
## Remove anything before the body of the article.
|
||||
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
|
||||
|
||||
## Remove any insets from the body of the article.
|
||||
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
|
||||
|
||||
## Remove any reprint info from the body of the article.
|
||||
(r'<hr size.*?<p', lambda match : '<p'),
|
||||
|
||||
## Remove anything after the end of the article.
|
||||
(r'<!-- article end.*?</body>', lambda match : '</body>'),
|
||||
]
|
||||
dict(name='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
|
||||
dict(attrs={'class':['insetButton', 'insettipBox', 'insetClose']}),
|
||||
dict(attrs={'data-module-name':['resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}),
|
||||
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
|
||||
]
|
||||
|
||||
def javascript_login(self, br, username, password):
|
||||
@ -69,22 +36,22 @@ class Barrons(BasicNewsRecipe):
|
||||
f['password'] = password
|
||||
br.submit(timeout=120)
|
||||
|
||||
## Use the print version of a page when available.
|
||||
# Use the print version of a page when available.
|
||||
def print_version(self, url):
|
||||
main, sep, rest = url.rpartition('?')
|
||||
return main + '#text.print'
|
||||
|
||||
def postprocess_html(self, soup, first):
|
||||
|
||||
for tag in soup.findAll(name=['ul', 'li']):
|
||||
tag.name = 'div'
|
||||
for tag in soup.findAll(name ='div', attrs={'id': "articleThumbnail_1"}):
|
||||
tag.extract()
|
||||
def preprocess_html(self, soup):
|
||||
# Remove thumbnail for zoomable images
|
||||
for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
|
||||
img = div.find('img')
|
||||
if img is not None:
|
||||
img.extract()
|
||||
|
||||
return soup
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
|
||||
# Comment out the feeds you don't want retrieved.
|
||||
# Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
|
||||
|
||||
def get_feeds(self):
|
||||
return [
|
||||
@ -99,7 +66,6 @@ class Barrons(BasicNewsRecipe):
|
||||
def get_article_url(self, article):
|
||||
return article.get('link', None)
|
||||
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = None
|
||||
index = 'http://online.barrons.com/home-page'
|
||||
@ -110,20 +76,3 @@ class Barrons(BasicNewsRecipe):
|
||||
return cover_url
|
||||
|
||||
|
||||
## Logout of website
|
||||
## NOT CURRENTLY WORKING
|
||||
# def cleanup(self):
|
||||
# try:
|
||||
# self.browser.set_debug_responses(True)
|
||||
# import sys, logging
|
||||
# logger = logging.getLogger("mechanize")
|
||||
# logger.addHandler(logging.StreamHandler(sys.stdout))
|
||||
# logger.setLevel(logging.INFO)
|
||||
|
||||
# res = self.browser.open('http://online.barrons.com/logout')
|
||||
# except:
|
||||
# import traceback
|
||||
# traceback.print_exc()
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user