mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
parent 4768f29bf2
commit bec396158a
@@ -8,17 +8,10 @@ import copy
 # http://online.wsj.com/page/us_in_todays_paper.html
 
-
-def filter_classes(x):
-    if not x:
-        return False
-    bad_classes = {'articleInsetPoll', 'trendingNow', 'sTools', 'printSummary', 'mostPopular', 'relatedCollection'}
-    classes = frozenset(x.split())
-    return len(bad_classes.intersection(classes)) > 0
 
 class WallStreetJournal(BasicNewsRecipe):
 
     title = 'The Wall Street Journal'
-    __author__ = 'Kovid Goyal, Sujata Raman, and Joshua Oster-Morris'
+    __author__ = 'Kovid Goyal and Joshua Oster-Morris'
     description = 'News and current affairs'
     needs_subscription = True
     language = 'en'
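For context: the deleted filter_classes helper was a callable class matcher. It received a tag's raw class string and reported whether any blacklisted class was present, and remove_tags invoked it via {'class':filter_classes}. A minimal standalone sketch of that pattern, using stock bs4 rather than the BeautifulSoup build calibre bundles (sample markup invented):

from bs4 import BeautifulSoup

def filter_classes(x):
    # x is the tag's class attribute as one string (or None/empty)
    if not x:
        return False
    bad_classes = {'articleInsetPoll', 'trendingNow', 'sTools',
                   'printSummary', 'mostPopular', 'relatedCollection'}
    return bool(bad_classes.intersection(x.split()))

html = '<div class="sTools extra">chrome</div><p class="story">body</p>'
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(True):
    # bs4 stores class as a list; rejoin it before applying the filter
    if filter_classes(' '.join(tag.get('class', []))):
        tag.extract()
print(soup)  # -> <p class="story">body</p>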
@@ -39,23 +32,16 @@ class WallStreetJournal(BasicNewsRecipe):
     .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
     h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
     .paperLocation{color:#666666; font-size:xx-small}'''
 
-    remove_tags_before = dict(name='h1')
+    keep_only_tags = [
+        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
+        dict(name='span', itemprop='author', rel='author'),
+        dict(name='article', id='articleBody'),
+        dict(name='div', id='article_story_body'),
+    ]
     remove_tags = [
-        dict(id=["articleTabs_tab_article",
-            "articleTabs_tab_comments", 'msnLinkback', 'yahooLinkback',
-            'articleTabs_panel_comments', 'footer', 'emailThisScrim', 'emailConfScrim', 'emailErrorScrim',
-            "articleTabs_tab_interactive", "articleTabs_tab_video",
-            "articleTabs_tab_map", "articleTabs_tab_slideshow",
-            "articleTabs_tab_quotes", "articleTabs_tab_document",
-            "printModeAd", "aFbLikeAuth", "videoModule",
-            "mostRecommendations", "topDiscussions"]),
-        {'class':['footer_columns','hidden', 'network','insetCol3wide','interactive','video','slideshow','map','insettip',
-            'insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
-        dict(rel='shortcut icon'),
-        {'class':filter_classes},
-    ]
-    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
+        dict(attrs={'class':['insetButton', 'insettipBox']}),
+        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
+    ]
 
     use_javascript_to_login = True
 
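keep_only_tags is the calibre BasicNewsRecipe attribute that inverts the old remove_tags_before/remove_tags_after approach: each dict is a BeautifulSoup match spec, and only matching subtrees survive into the article. A rough standalone emulation of that matching, not calibre's actual implementation (sample markup invented; the id= shorthand from the diff is written out as attrs here):

from bs4 import BeautifulSoup

keep_only_tags = [
    dict(name='h1'),
    dict(name='div', attrs={'id': 'article_story_body'}),
]

html = ('<body><div id="nav">site chrome</div>'
        '<h1>Headline</h1>'
        '<div id="article_story_body">Story text</div></body>')
soup = BeautifulSoup(html, 'html.parser')

# Collect every subtree matched by a spec, then rebuild <body> from them
kept = []
for spec in keep_only_tags:
    kept.extend(soup.find_all(spec.get('name'), attrs=spec.get('attrs', {})))

new_body = soup.new_tag('body')
for tag in kept:
    new_body.append(tag.extract())
soup.body.replace_with(new_body)
print(soup)  # the "nav" div is gone; headline and story remain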
@@ -72,15 +58,12 @@ class WallStreetJournal(BasicNewsRecipe):
             if picdiv is not None:
                 self.add_toc_thumbnail(article,picdiv['src'])
 
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['table', 'tr', 'td']):
-            tag.name = 'div'
-
-        for tag in soup.findAll('div', dict(id=[
-            "articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3",
-            "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6",
-            "articleThumbnail_7"])):
-            tag.extract()
-
+    def preprocess_html(self, soup):
+        # Remove thumbnail for zoomable images
+        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
+            img = div.find('img')
+            if img is not None:
+                img.extract()
         return soup
 
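The replacement preprocess_html no longer rewrites tables or strips the numbered articleThumbnail_N divs; it only drops the redundant thumbnail <img> inside each zoomable inset. The loop can be exercised standalone like this (sample markup invented; stock bs4 in place of calibre's bundled soup):

from bs4 import BeautifulSoup

html = ('<div class="insetZoomTargetBox full">'
        '<img src="thumb.jpg"/><a href="full.jpg">Enlarge</a></div>')
soup = BeautifulSoup(html, 'html.parser')

# Same matcher as the recipe: any div whose class list contains insetZoomTargetBox
for div in soup.find_all('div', attrs={'class': lambda x: x and 'insetZoomTargetBox' in x.split()}):
    img = div.find('img')
    if img is not None:
        img.extract()  # keep the zoom link, drop the low-res thumbnail

print(soup)  # -> <div class="insetZoomTargetBox full"><a href="full.jpg">Enlarge</a></div>

The hunks that follow, whose old line numbers restart at 33, apply matching changes to a second recipe file in the same commit.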
@@ -33,21 +33,16 @@ class WallStreetJournal(BasicNewsRecipe):
     h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
     .paperLocation{color:#666666; font-size:xx-small}'''
 
-    remove_tags_before = dict(name='h1')
+    keep_only_tags = [
+        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
+        dict(name='span', itemprop='author', rel='author'),
+        dict(name='article', id='articleBody'),
+        dict(name='div', id='article_story_body'),
+    ]
     remove_tags = [
-        dict(id=["articleTabs_tab_article",
-            "articleTabs_tab_comments",
-            "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow",
-            "articleTabs_tab_quotes"]),
-        {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
-        dict(name='div', attrs={'data-flash-settings':True}),
-        {'class':['insetContent embedType-interactive insetCol3wide','insetCol6wide','insettipUnit']},
-        dict(rel='shortcut icon'),
-        {'class':lambda x: x and 'sTools' in x},
-        {'class':lambda x: x and 'printSummary' in x},
-        {'class':lambda x: x and 'mostPopular' in x},
-    ]
-    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
+        dict(attrs={'class':['insetButton', 'insettipBox']}),
+        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
+    ]
 
     def populate_article_metadata(self, article, soup, first):
         if first and hasattr(self, 'add_toc_thumbnail'):
@@ -55,12 +50,12 @@ class WallStreetJournal(BasicNewsRecipe):
             if picdiv is not None:
                 self.add_toc_thumbnail(article,picdiv['src'])
 
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['table', 'tr', 'td']):
-            tag.name = 'div'
-
-        for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
-            tag.extract()
-
+    def preprocess_html(self, soup):
+        # Remove thumbnail for zoomable images
+        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
+            img = div.find('img')
+            if img is not None:
+                img.extract()
         return soup
 
@@ -69,7 +64,6 @@ class WallStreetJournal(BasicNewsRecipe):
             href = 'http://online.wsj.com' + href
         return href
 
-
     def wsj_get_index(self):
         return self.index_to_soup('http://online.wsj.com/itp')
 
@@ -83,7 +77,7 @@ class WallStreetJournal(BasicNewsRecipe):
         except:
             articles = []
         if articles:
             feeds.append((title, articles))
         return feeds
 
     def parse_index(self):
@@ -99,16 +93,16 @@ class WallStreetJournal(BasicNewsRecipe):
         for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
             pageone = a['href'].endswith('pageone')
             if pageone:
                 title = 'Front Section'
                 url = self.abs_wsj_url(a['href'])
                 feeds = self.wsj_add_feed(feeds,title,url)
                 title = 'What''s News'
                 url = url.replace('pageone','whatsnews')
                 feeds = self.wsj_add_feed(feeds,title,url)
             else:
                 title = self.tag_to_string(a)
                 url = self.abs_wsj_url(a['href'])
                 feeds = self.wsj_add_feed(feeds,title,url)
         return feeds
 
     def wsj_find_wn_articles(self, url):
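The pageone branch above derives two feeds from a single ITP link; the second URL is just the first with its last path component swapped. (Note that 'What''s News' is adjacent-literal concatenation in Python, so the title actually renders as "Whats News", without an apostrophe.) A sketch of the derivation with an invented href:

href = '/itp/20140101/us/pageone'
if href.endswith('pageone'):
    front_url = 'http://online.wsj.com' + href          # 'Front Section' feed
    wn_url = front_url.replace('pageone', 'whatsnews')  # "What's News" feed
    print(front_url, wn_url)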
@@ -117,21 +111,21 @@ class WallStreetJournal(BasicNewsRecipe):
 
         whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
         if whats_news is not None:
             for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
                 container = a.findParent(['p'])
                 meta = a.find(attrs={'class':'meta_sectionName'})
                 if meta is not None:
                     meta.extract()
                 title = self.tag_to_string(a).strip()
                 url = a['href']
                 desc = ''
                 if container is not None:
                     desc = self.tag_to_string(container)
 
                 articles.append({'title':title, 'url':url,
                     'description':desc, 'date':''})
 
                 self.log('\tFound WN article:', title)
 
         return articles
 
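The dicts appended here follow the shape calibre's parse_index() contract expects: the method returns a list of (section title, articles) pairs, where each article is a dict with 'title' and 'url' plus optional 'description' and 'date'. A minimal illustration with invented values:

def parse_index(self):
    # Minimal parse_index return value: [(feed title, [article dicts])]
    articles = [{
        'title': 'Example headline',
        'url': 'http://online.wsj.com/article/example.html',
        'description': 'Summary text shown in the e-book TOC',
        'date': '',
    }]
    return [('Front Section', articles)]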
@@ -140,18 +134,18 @@ class WallStreetJournal(BasicNewsRecipe):
 
         whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
         if whats_news is not None:
             whats_news.extract()
 
         articles = []
 
         flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
         if flavorarea is not None:
             flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
             if flavorstory is not None:
                 flavorstory['class'] = 'mjLinkItem'
                 metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
                 if metapage is not None:
-                    flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page
+                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page
 
         for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
             container = a.findParent(['li', 'div'])
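The only change in this hunk is cosmetic (spacing around copy.copy), but the copy itself matters: appending a tag that already lives in the tree would move it, while appending a copy leaves the original meta_sectionName span in place. A small demonstration with stock bs4 and invented markup:

import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a class="mjLinkItem">Story</a>'
                     '<span class="meta_sectionName">A1</span>', 'html.parser')
a = soup.find('a')
span = soup.find('span', attrs={'class': 'meta_sectionName'})
a.append(copy.copy(span))  # duplicate the section label instead of moving it
print(soup)
# -> <a class="mjLinkItem">Story<span class="meta_sectionName">A1</span></a>
#    <span class="meta_sectionName">A1</span>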
@@ -176,5 +170,3 @@ class WallStreetJournal(BasicNewsRecipe):
             self.log('\tFound article:', title)
 
         return articles
-
-