mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
Fix #3987 (Barrons.com is not properly parsed anymore)
This commit is contained in:
parent
6e47d77a2a
commit
fb0062dd69
@ -21,7 +21,7 @@ class Barrons(BasicNewsRecipe):
|
||||
# Human-readable summary of the publication this recipe downloads.
description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
# strftime-style pattern (weekday, month, day, year) wrapped in brackets.
timefmt = ' [%a, %b %d, %Y]'
use_embedded_content = False
# Only one assignment is kept: an earlier "False" value was overwritten
# immediately and never took effect.
no_stylesheets = True
# Raw string so the regex escape \? reaches the pattern untouched and does not
# trigger an invalid-escape warning; the resulting value is unchanged.
match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
conversion_options = {'linearize_tables': True}
##delay = 1
|
||||
@ -29,6 +29,20 @@ class Barrons(BasicNewsRecipe):
|
||||
## Don't grab articles more than 7 days old
oldest_article = 7

# Additional CSS rules for the downloaded articles: fonts and colours for the
# date stamp, headings, byline, subhead, article body and inset tips.
extra_css = '''
.datestamp{color:#666666; font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
h3{color:#FF0000; font-family:Georgia,"Times New Roman",Times,serif; }
h2{font-family:Georgia,"Times New Roman",Times,serif; }
h1{ font-family:Georgia,"Times New Roman",Times,serif; }
.byline{color:#AAAAAA; font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
.subhead{color:#666666; font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
.articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;color:#333333;}
.insettipUnit{font-size: x-small;}
'''

# Tag specifications (element name plus attribute values) assigned to
# remove_tags: the tabbed navigation, RSS and article toolboxes, and the
# inset "close" links.
# NOTE(review): presumably consumed by BasicNewsRecipe to strip matching
# elements from each page - confirm against the recipe API.
remove_tags = [
    {
        'name': 'div',
        'attrs': {'class': ['tabContainer artTabbedNav', 'rssToolBox hidden', 'articleToolbox']},
    },
    {
        'name': 'a',
        'attrs': {'class': 'insetClose'},
    },
]
|
||||
|
||||
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
@ -56,10 +70,20 @@ class Barrons(BasicNewsRecipe):
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
## Use the print version of a page when available.
|
||||
## Use the print version of a page when available.
|
||||
|
||||
def print_version(self, url):
    """Return the print-mode address for an article URL.

    Everything from the last '?' onwards (the query string) is dropped and
    the '#printmode' fragment is appended to what remains.

    :param url: article URL as a string, with or without a query string.
    :return: the URL without its trailing query string, plus '#printmode'.
    """
    main, sep, _rest = url.rpartition('?')
    if not sep:
        # With no '?' present, rpartition() returns ('', '', url); without
        # this guard the result would be a bare '#printmode'.
        main = url
    return main + '#printmode'
|
||||
|
||||
def postprocess_html(self, soup, first):
    """Rewrite list markup as plain blocks and drop the first thumbnail.

    Every <ul> and <li> element is renamed to <div>; afterwards each <div>
    whose id is "articleThumbnail_1" is extracted from the tree. The tree is
    modified in place.

    :param soup: parsed document exposing ``findAll``.
    :param first: not read by this method.
    :return: the same ``soup`` object that was passed in.
    """
    list_nodes = soup.findAll(name=['ul', 'li'])
    for node in list_nodes:
        node.name = 'div'

    # Searched after the rename above, so renamed list elements carrying this
    # id are matched as well.
    thumbnails = soup.findAll(name='div', attrs={'id': "articleThumbnail_1"})
    for node in thumbnails:
        node.extract()

    return soup
|
||||
|
||||
## Comment out the feeds you don't want retrieved.
|
||||
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
|
||||
@ -74,6 +98,17 @@ class Barrons(BasicNewsRecipe):
|
||||
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
|
||||
]
|
||||
|
||||
|
||||
def get_cover_url(self):
    """Look up the cover image address on the Barrons home page.

    Fetches the home page through ``self.index_to_soup``, finds the <ul>
    whose class is 'newsItem barronsMag' and reads the ``src`` of the first
    image inside it.

    :return: the image's ``src`` value, or None when the list, the image or
        its ``src`` attribute is missing.
    """
    index = 'http://online.barrons.com/home-page'
    soup = self.index_to_soup(index)
    link_item = soup.find('ul', attrs={'class': 'newsItem barronsMag'})
    if not link_item:
        return None
    img = link_item.img
    if img is None:
        # The list exists but holds no image: report "no cover" rather than
        # failing on a None subscript.
        return None
    # .get() yields None instead of raising when the image has no src.
    return img.get('src')
|
||||
|
||||
|
||||
## Logout of website
|
||||
## NOT CURRENTLY WORKING
|
||||
# def cleanup(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user