Update Wall Street Journal

Fixes #1395546 [Private bug](https://bugs.launchpad.net/calibre/+bug/1395546)
Kovid Goyal 2014-11-25 21:24:51 +05:30
parent 37439fecf6
commit 4cd960d9d9
2 changed files with 293 additions and 313 deletions
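
Both recipes below are ported from BasicNewsRecipe to JavascriptRecipe, and element lookups move from BeautifulSoup methods to lxml, driven by a small CSSSelect helper that compiles a CSS selector into a reusable XPath. The following is a minimal, self-contained sketch of the fallback helper the diff defines for older calibre versions; the sample HTML is invented purely for illustration.

    from cssselect import HTMLTranslator
    from lxml import html
    from lxml.etree import XPath

    def CSSSelect(expr):
        # Translate the CSS expression once; the returned XPath object can be
        # applied to any lxml tree, e.g. CSSSelect('div.whatsNews-simple')(root).
        return XPath(HTMLTranslator().css_to_xpath(expr))

    root = html.fromstring(
        '<div class="itpHeader"><ul class="tab">'
        '<li><a href="/itp/pageone">Page One</a></li></ul></div>')
    for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
        print(a.get('href'))  # -> /itp/pageone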


@@ -1,77 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.web.feeds.news import BasicNewsRecipe
import copy, re
try:
from calibre.web.feeds.jsnews import CSSSelect
except ImportError:
def CSSSelect(expr):
from cssselect import HTMLTranslator
from lxml.etree import XPath
return XPath(HTMLTranslator().css_to_xpath(expr))
# http://online.wsj.com/page/us_in_todays_paper.html
class WallStreetJournal(BasicNewsRecipe):
class WSJ(JavascriptRecipe):
title = 'The Wall Street Journal'
__author__ = 'Kovid Goyal and Joshua Oster-Morris'
__author__ = 'Kovid Goyal'
description = 'News and current affairs'
needs_subscription = True
language = 'en'
compress_news_images = True
compress_news_images_auto_size = 5
max_articles_per_feed = 1000
timefmt = ' [%a, %b %d, %Y]'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'data-scrim']
needs_subscription = True
keep_only_tags = [
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
dict(name='span', itemprop='author', rel='author'),
dict(name='article', id=['article-contents', 'articleBody']),
dict(name='div', id='article_story_body'),
dict(name='div', attrs={'class':'snippet-ad-login'}),
]
remove_tags = [
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
]
preprocess_regexps = [
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
]
keep_only_tags = (
'h1', # 'h2.subhead', 'h2.subHed.deck',
'span[itemprop=author][rel=author]',
'article#article-contents', 'article#articleBody',
'div#article_story_body',
# Parallax formatting
'div#ncTitleArea', 'section.nc-exp-artbody',
# Error conditions, login required and page not found
'div#snippet-ad-login', 'div.errorNotFound',
)
use_javascript_to_login = True
remove_tags = (
'.insetButton', '.insettipBox', '.author-info', '.media-object-video',
'.article_tools', 'span[data-country-code][data-ticker-code]',
'div.nc-exp-artmeta',
)
def javascript_login(self, br, username, password):
br.visit('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)
def do_login(self, br, username, password):
br.visit(
'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120) # noqa
f = br.select_form(nr=0)
f['username'] = username
f['password'] = password
br.submit(timeout=120)
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img', src=True)
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
def preprocess_stage2(self, article, browser, url, recursion_level):
# Slideshow and expandable images need to be processed here to
# set the src attribute correctly
found = 0
for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
found += 1
for img in browser.css_select('img[data-enlarge]', all=True):
img.setAttribute('src', img.attribute('data-enlarge'))
found += 1
if found:
self.log.debug('Found %d dynamic images in:' % found, url)
def preprocess_html(self, soup):
# Remove thumbnail for zoomable images
for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
img = div.find('img')
if img is not None:
img.extract()
# Use large images
for img in soup.findAll('img', attrs={'data-enlarge':True}):
img['src'] = img['data-enlarge']
def get_publication_data(self, browser):
return self.get_wsj_index(browser)
return soup
def abs_wsj_url(self, href):
if not href.startswith('http'):
href = 'http://online.wsj.com' + href
return href
def wsj_get_index(self):
return self.index_to_soup('http://online.wsj.com/itp')
def wsj_find_articles(self, url):
root = self.index_to_soup(url)
def wsj_add_feed(self,feeds,title,url):
for x in CSSSelect('div.whatsNews-simple')(root):
x.getparent().remove(x)
articles = []
for a in CSSSelect('a.mjLinkItem[href]')(root):
container = a.xpath('ancestor::li')
meta = CSSSelect('.meta_sectionName')(a)
if meta:
meta = meta[0]
meta.getparent().remove(meta)
meta = self.tag_to_string(meta)
title = self.tag_to_string(a)
if meta:
title += ' [%s]' % meta
url = self.abs_wsj_url(a.get('href'))
desc = ''
if container:
for p in CSSSelect('p')(container[0]):
desc = self.tag_to_string(p)
if 'Subscriber Content' not in desc:
break
articles.append({'title': title, 'url': url,
'description': desc, 'date': ''})
self.log('\tFound article:', title)
self.log('\t\t', desc)
return articles
def wsj_find_wn_articles(self, url):
root = self.index_to_soup(url)
articles = []
whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
if whats_news:
for a in CSSSelect('a[href]')(whats_news[-1]):
if '/articles/' not in a.get('href', ''):
continue
container = a.xpath('ancestor::p')
for meta in CSSSelect('.meta_sectionName')(a):
meta.getparent().remove(meta)
title = self.tag_to_string(a).strip()
url = self.abs_wsj_url(a.get('href'))
desc = ''
if container:
desc = self.tag_to_string(container[0])
articles.append({'title': title, 'url': url,
'description': desc, 'date': ''})
self.log('\tFound WN article:', title)
self.log('\t\t', desc)
return articles
def wsj_add_feed(self, feeds, title, url):
self.log('Found section:', title)
try:
if url.endswith('whatsnews'):
@@ -82,129 +141,47 @@ class WallStreetJournal(BasicNewsRecipe):
articles = []
if articles:
feeds.append((title, articles))
return feeds
def abs_wsj_url(self, href):
if not href.startswith('http'):
href = 'http://online.wsj.com' + href
return href
def get_wsj_index(self, browser):
# return self.test_wsj_index()
ans = {}
root = self.index_to_soup('http://online.wsj.com/itp')
for span in CSSSelect('span.date-date')(root):
if span.text:
self.timefmt = span.text
break
for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
href = a.get('href')
if href:
break
ans['cover'] = browser.download_file(href)
def parse_index(self):
soup = self.wsj_get_index()
date = soup.find('span', attrs={'class':'date-date'})
if date is not None:
self.timefmt = ' [%s]'%self.tag_to_string(date)
cov = soup.find('div', attrs={'class':lambda x: x and 'itpSectionHeaderPdf' in x.split()})
if cov is not None:
a = cov.find('a', href=True)
if a is not None:
self.cover_url = a['href']
feeds = []
div = soup.find('div', attrs={'class':'itpHeader'})
div = div.find('ul', attrs={'class':'tab'})
for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
pageone = a['href'].endswith('pageone')
feeds = ans['index'] = []
for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
if '/itp/' not in a.get('href', ''):
continue
pageone = a.get('href').endswith('pageone')
if pageone:
title = 'Front Section'
url = self.abs_wsj_url(a['href'])
feeds = self.wsj_add_feed(feeds,title,url)
url = self.abs_wsj_url(a.get('href'))
self.wsj_add_feed(feeds, title, url)
title = "What's News"
url = url.replace('pageone','whatsnews')
feeds = self.wsj_add_feed(feeds,title,url)
url = url.replace('pageone', 'whatsnews')
self.wsj_add_feed(feeds, title, url)
else:
title = self.tag_to_string(a)
url = self.abs_wsj_url(a['href'])
feeds = self.wsj_add_feed(feeds,title,url)
for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
h2 = li.find('h2')
if h2 is None:
continue
a = h2.find('a', href=True)
if a is None:
continue
url = a['href']
title = self.tag_to_string(a)
p = h2.findNextSibling('p')
if p is not None:
desc = self.tag_to_string(p)
else:
desc = ''
if feeds:
feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
return feeds
def wsj_find_wn_articles(self, url):
soup = self.index_to_soup(url)
articles = []
whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
if whats_news is not None:
for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
container = a.findParent(['p'])
meta = a.find(attrs={'class':'meta_sectionName'})
if meta is not None:
meta.extract()
title = self.tag_to_string(a).strip()
url = a['href']
desc = ''
if container is not None:
desc = self.tag_to_string(container)
articles.append({'title':title, 'url':url,
'description':desc, 'date':''})
self.log('\tFound WN article:', title)
self.log('\t\t', desc)
return articles
def wsj_find_articles(self, url):
soup = self.index_to_soup(url)
whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
if whats_news is not None:
whats_news.extract()
articles = []
flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
if flavorarea is not None:
flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
if flavorstory is not None:
flavorstory['class'] = 'mjLinkItem'
metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
if metapage is not None:
flavorstory.append(copy.copy(metapage)) # metapage should always be A1 because that should be first on the page
for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
container = a.findParent(['li', 'div'])
meta = a.find(attrs={'class':'meta_sectionName'})
if meta is not None:
meta.extract()
meta = self.tag_to_string(meta).strip()
if meta:
title = self.tag_to_string(a).strip() + ' [%s]'%meta
else:
title = self.tag_to_string(a).strip()
url = self.abs_wsj_url(a['href'])
desc = ''
for p in container.findAll('p'):
desc = self.tag_to_string(p)
if 'Subscriber Content' not in desc:
break
articles.append({'title':title, 'url':url,
'description':desc, 'date':''})
self.log('\tFound article:', title)
self.log('\t\t', desc)
return articles
def cleanup(self):
self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
url = self.abs_wsj_url(a.get('href'))
self.wsj_add_feed(feeds, title, url)
return ans
def test_wsj_index(self):
return {'index': [
('Testing', [
{'title': 'Article One',
'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'}, # noqa
{'title': 'Article Two',
'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'}, # noqa
{'title': 'Article Three',
'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'}, # noqa
]),
]}
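
The get_wsj_index() method above feeds JavascriptRecipe.get_publication_data(), which expects a dict holding the downloaded cover (if any) and the section index. Below is a hedged sketch of that return shape, inferred from the diff and from test_wsj_index(); the article values are placeholders, not real WSJ data.

    example_publication_data = {
        # Path returned by browser.download_file(href) for the front-page PDF,
        # left as None here for illustration.
        'cover': None,
        # One (section title, articles) pair per feed, in reading order.
        'index': [
            ('Front Section', [
                {'title': 'Example article [U.S.]',
                 'url': 'http://online.wsj.com/articles/example',
                 'description': 'First paragraph that is not "Subscriber Content".',
                 'date': ''},
            ]),
        ],
    }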


@@ -1,71 +1,140 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.web.feeds.news import BasicNewsRecipe
import copy, re
try:
from calibre.web.feeds.jsnews import CSSSelect
except ImportError:
def CSSSelect(expr):
from cssselect import HTMLTranslator
from lxml.etree import XPath
return XPath(HTMLTranslator().css_to_xpath(expr))
class WallStreetJournal(BasicNewsRecipe):
class WSJ(JavascriptRecipe):
title = 'Wall Street Journal (free)'
__author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
__author__ = 'Kovid Goyal'
description = '''News and current affairs. This recipe only fetches complete
versions of the articles that are available free on the wsj.com website.
To get the rest of the articles, subscribe to the WSJ and use the other WSJ
recipe.'''
language = 'en'
cover_url = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
compress_news_images = True
compress_news_images_auto_size = 5
max_articles_per_feed = 1000
timefmt = ' [%a, %b %d, %Y]'
no_stylesheets = True
ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'data-scrim']
needs_subscription = True
keep_only_tags = [
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
dict(name='span', itemprop='author', rel='author'),
dict(name='article', id=['article-contents', 'articleBody']),
dict(name='div', id='article_story_body'),
dict(name='div', attrs={'class':'snippet-ad-login'}),
]
remove_tags = [
dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
]
preprocess_regexps = [
(re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
]
keep_only_tags = (
'h1', # 'h2.subhead', 'h2.subHed.deck',
'span[itemprop=author][rel=author]',
'article#article-contents', 'article#articleBody',
'div#article_story_body',
# Parallax formatting
'div#ncTitleArea', 'section.nc-exp-artbody',
# Error conditions, login required and page not found
'div#snippet-ad-login', 'div.errorNotFound',
)
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img', src=True)
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
remove_tags = (
'.insetButton', '.insettipBox', '.author-info', '.media-object-video',
'.article_tools', 'span[data-country-code][data-ticker-code]',
'div.nc-exp-artmeta',
)
def preprocess_html(self, soup):
# Remove thumbnail for zoomable images
for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
img = div.find('img')
if img is not None:
img.extract()
# Use large images
for img in soup.findAll('img', attrs={'data-enlarge':True}):
img['src'] = img['data-enlarge']
def do_login(self, br, username, password):
br.visit(
'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120) # noqa
f = br.select_form(nr=0)
f['username'] = username
f['password'] = password
br.submit(timeout=120)
return soup
def preprocess_stage2(self, article, browser, url, recursion_level):
# Slideshow and expandable images need to be processed here to
# set the src attribute correctly
found = 0
for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
found += 1
for img in browser.css_select('img[data-enlarge]', all=True):
img.setAttribute('src', img.attribute('data-enlarge'))
found += 1
if found:
self.log.debug('Found %d dynamic images in:' % found, url)
def get_publication_data(self, browser):
return self.get_wsj_index(browser)
def abs_wsj_url(self, href):
if not href.startswith('http'):
href = 'http://online.wsj.com' + href
return href
def wsj_get_index(self):
return self.index_to_soup('http://online.wsj.com/itp')
def wsj_find_articles(self, url):
root = self.index_to_soup(url)
def wsj_add_feed(self,feeds,title,url):
for x in CSSSelect('div.whatsNews-simple')(root):
x.getparent().remove(x)
articles = []
for a in CSSSelect('a.mjLinkItem[href]')(root):
container = a.xpath('ancestor::li')
meta = CSSSelect('.meta_sectionName')(a)
if meta:
meta = meta[0]
meta.getparent().remove(meta)
meta = self.tag_to_string(meta)
title = self.tag_to_string(a)
if meta:
title += ' [%s]' % meta
url = self.abs_wsj_url(a.get('href'))
desc = ''
if container:
for p in CSSSelect('p')(container[0]):
desc = self.tag_to_string(p)
if 'Subscriber Content' not in desc:
break
articles.append({'title': title, 'url': url,
'description': desc, 'date': ''})
self.log('\tFound article:', title)
self.log('\t\t', desc)
return articles
def wsj_find_wn_articles(self, url):
root = self.index_to_soup(url)
articles = []
whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
if whats_news:
for a in CSSSelect('a[href]')(whats_news[-1]):
if '/articles/' not in a.get('href', ''):
continue
container = a.xpath('ancestor::p')
for meta in CSSSelect('.meta_sectionName')(a):
meta.getparent().remove(meta)
title = self.tag_to_string(a).strip()
url = self.abs_wsj_url(a.get('href'))
desc = ''
if container:
desc = self.tag_to_string(container[0])
articles.append({'title': title, 'url': url,
'description': desc, 'date': ''})
self.log('\tFound WN article:', title)
self.log('\t\t', desc)
return articles
def wsj_add_feed(self, feeds, title, url):
self.log('Found section:', title)
try:
if url.endswith('whatsnews'):
@@ -76,113 +145,47 @@ class WallStreetJournal(BasicNewsRecipe):
articles = []
if articles:
feeds.append((title, articles))
return feeds
def parse_index(self):
soup = self.wsj_get_index()
def get_wsj_index(self, browser):
# return self.test_wsj_index()
ans = {}
root = self.index_to_soup('http://online.wsj.com/itp')
for span in CSSSelect('span.date-date')(root):
if span.text:
self.timefmt = span.text
break
for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
href = a.get('href')
if href:
break
ans['cover'] = browser.download_file(href)
date = soup.find('span', attrs={'class':'date-date'})
if date is not None:
self.timefmt = ' [%s]'%self.tag_to_string(date)
feeds = []
div = soup.find('div', attrs={'class':'itpHeader'})
div = div.find('ul', attrs={'class':'tab'})
for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
pageone = a['href'].endswith('pageone')
feeds = ans['index'] = []
for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
if '/itp/' not in a.get('href', ''):
continue
pageone = a.get('href').endswith('pageone')
if pageone:
title = 'Front Section'
url = self.abs_wsj_url(a['href'])
feeds = self.wsj_add_feed(feeds,title,url)
title = 'What''s News'
url = url.replace('pageone','whatsnews')
feeds = self.wsj_add_feed(feeds,title,url)
url = self.abs_wsj_url(a.get('href'))
self.wsj_add_feed(feeds, title, url)
title = "What's News"
url = url.replace('pageone', 'whatsnews')
self.wsj_add_feed(feeds, title, url)
else:
title = self.tag_to_string(a)
url = self.abs_wsj_url(a['href'])
feeds = self.wsj_add_feed(feeds,title,url)
url = self.abs_wsj_url(a.get('href'))
self.wsj_add_feed(feeds, title, url)
return ans
for li in soup.findAll('li', attrs={'class':'ahed_listitem'}):
h2 = li.find('h2')
if h2 is None:
continue
a = h2.find('a', href=True)
if a is None:
continue
url = a['href']
title = self.tag_to_string(a)
p = h2.findNextSibling('p')
if p is not None:
desc = self.tag_to_string(p)
else:
desc = ''
if feeds:
feeds[0][1].append({'title':title, 'url':url, 'description':desc, 'date':''})
feeds = [x for x in feeds if x[0] == 'Opinion']
return feeds
def wsj_find_wn_articles(self, url):
soup = self.index_to_soup(url)
articles = []
whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
if whats_news is not None:
for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
container = a.findParent(['p'])
meta = a.find(attrs={'class':'meta_sectionName'})
if meta is not None:
meta.extract()
title = self.tag_to_string(a).strip()
url = a['href']
desc = ''
if container is not None:
desc = self.tag_to_string(container)
articles.append({'title':title, 'url':url,
'description':desc, 'date':''})
self.log('\tFound WN article:', title)
return articles
def wsj_find_articles(self, url):
soup = self.index_to_soup(url)
whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
if whats_news is not None:
whats_news.extract()
articles = []
flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
if flavorarea is not None:
flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
if flavorstory is not None:
flavorstory['class'] = 'mjLinkItem'
metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
if metapage is not None:
flavorstory.append(copy.copy(metapage)) # metapage should always be A1 because that should be first on the page
for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
container = a.findParent(['li', 'div'])
meta = a.find(attrs={'class':'meta_sectionName'})
if meta is not None:
meta.extract()
meta = self.tag_to_string(meta).strip()
if meta:
title = self.tag_to_string(a).strip() + ' [%s]'%meta
else:
title = self.tag_to_string(a).strip()
url = self.abs_wsj_url(a['href'])
desc = ''
for p in container.findAll('p'):
desc = self.tag_to_string(p)
if 'Subscriber Content' not in desc:
break
articles.append({'title':title, 'url':url,
'description':desc, 'date':''})
self.log('\tFound article:', title)
return articles
def test_wsj_index(self):
return {'index': [
('Testing', [
{'title': 'Article One',
'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'}, # noqa
{'title': 'Article Two',
'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'}, # noqa
{'title': 'Article Three',
'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'}, # noqa
]),
]}
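
Both files also gain a test_wsj_index() hook. The short sketch below shows how it is meant to be used while working on the recipe, mirroring the commented-out line in get_wsj_index(); this is a development aid only, not part of the normal fetch.

    def get_wsj_index(self, browser):
        # Skip the live section index and download only the three
        # hard-coded test articles defined in test_wsj_index().
        return self.test_wsj_index()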