# calibre/recipes/wsj.recipe
# 2015-02-22 14:50:04 +05:30

# 199 lines
# 8.0 KiB
# Plaintext

from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.web.jsbrowser.browser import NotAFile
# Hand-translated XPath equivalents for the CSS selectors this recipe uses.
# Only the selectors listed here are supported by CSSSelect().
_CSS_TO_XPATH = {
    'div.whatsNews-simple': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ')]''',
    'a.mjLinkItem[href]': '''descendant-or-self::a[@class and contains(concat(' ', normalize-space(@class), ' '), ' mjLinkItem ') and (@href)]''',
    '.meta_sectionName': '''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' meta_sectionName ')]''',
    'p': 'descendant-or-self::p',
    'div.whatsNews-simple.whatsNews-itp': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-itp '))]''',
    'a[href]': 'descendant-or-self::a[@href]',
    'span.date-date': "descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' date-date ')]",
    'div.itpSectionHeaderPdf a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpSectionHeaderPdf ')]/descendant-or-self::*/a[@href]",
    'div.itpHeader ul.tab a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpHeader ')]/descendant-or-self::*/ul[@class and contains(concat(' ', normalize-space(@class), ' '), ' tab ')]/descendant-or-self::*/a[@href]",
}

# Compiled XPath objects keyed by CSS selector, so selectors used inside
# loops are compiled only once.
_XPATH_CACHE = {}


def CSSSelect(expr):
    """Return a compiled lxml XPath callable for the CSS selector *expr*.

    The selector is not parsed: it is looked up in the ``_CSS_TO_XPATH``
    table of pre-translated expressions, so only selectors listed there
    are accepted.

    :param expr: one of the CSS selector strings in ``_CSS_TO_XPATH``
    :return: an ``lxml.etree.XPath`` object; call it with an element to get
             the list of matching nodes
    :raises KeyError: if *expr* has no pre-translated XPath
    """
    compiled = _XPATH_CACHE.get(expr)
    if compiled is None:
        # Resolve the selector before importing lxml so that a typo in a
        # selector is reported as such.
        try:
            xpath = _CSS_TO_XPATH[expr]
        except KeyError:
            raise KeyError(
                'No pre-translated XPath for CSS selector %r; '
                'add it to _CSS_TO_XPATH' % (expr,))
        from lxml.etree import XPath
        compiled = _XPATH_CACHE[expr] = XPath(xpath)
    return compiled
class WSJ(JavascriptRecipe):
    """Recipe for The Wall Street Journal (subscription required).

    The section index is built from http://online.wsj.com/itp; every tab on
    that page becomes one feed, and the front section additionally yields a
    "What's News" feed.
    """

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal'
    description = 'News and current affairs'
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 7

    max_articles_per_feed = 1000
    # Replaced at run time by the date text found on the index page,
    # see get_wsj_index().
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']
    needs_subscription = True

    keep_only_tags = (
        'h1',  # 'h2.subhead', 'h2.subHed.deck',
        'span[itemprop=author][rel=author]',
        'article#article-contents', 'article#articleBody',
        'div#article_story_body',
        # Parallax formatting
        'div#ncTitleArea', 'section.nc-exp-artbody',
        # Error conditions, login required and page not found
        'div#snippet-ad-login', 'div.errorNotFound',
    )

    remove_tags = (
        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
        '.article_tools', 'span[data-country-code][data-ticker-code]',
        'div.nc-exp-artmeta',
    )

    def do_login(self, br, username, password):
        """Fill in and submit the first form of the WSJ standalone login page.

        :param br: browser object used for the session
        :param username: subscriber user name
        :param password: subscriber password
        """
        br.visit(
            'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)  # noqa
        f = br.select_form(nr=0)
        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)

    def preprocess_stage2(self, article, browser, url, recursion_level):
        """Point ``src`` of dynamically loaded images at their real URL.

        Slideshow and expandable images keep the real image location in a
        data attribute, so it is copied into ``src`` here.
        """
        found = 0
        # Same order as before: lazy-loaded images first, then enlargeable
        # ones (so 'data-enlarge' wins when an image carries both).
        for attr in ('data-in-base-data-lazy', 'data-enlarge'):
            for img in browser.css_select('img[%s]' % attr, all=True):
                img.setAttribute('src', img.attribute(attr))
                found += 1
        if found:
            self.log.debug('Found %d dynamic images in:' % found, url)

    def get_publication_data(self, browser):
        """Return the publication data dict built by get_wsj_index()."""
        return self.get_wsj_index(browser)

    def abs_wsj_url(self, href):
        """Return *href* unchanged if it starts with 'http', otherwise
        prefixed with the online.wsj.com host."""
        if not href.startswith('http'):
            href = 'http://online.wsj.com' + href
        return href

    def wsj_find_articles(self, url):
        """Collect the articles linked from the section page at *url*.

        :param url: absolute URL of a section page
        :return: list of dicts with 'title', 'url', 'description' and 'date'
        """
        root = self.index_to_soup(url)

        # Drop the "What's News" boxes first so their links are not picked
        # up by the article selector below.
        for x in CSSSelect('div.whatsNews-simple')(root):
            x.getparent().remove(x)

        # Build the selectors once instead of once per link.
        select_links = CSSSelect('a.mjLinkItem[href]')
        select_meta = CSSSelect('.meta_sectionName')
        select_paras = CSSSelect('p')

        articles = []
        for a in select_links(root):
            container = a.xpath('ancestor::li')

            # The section name lives inside the link; take it out so it does
            # not end up in the title text, then append it in brackets.
            meta = select_meta(a)
            if meta:
                meta = meta[0]
                meta.getparent().remove(meta)
                meta = self.tag_to_string(meta)
            title = self.tag_to_string(a)
            if meta:
                title += ' [%s]' % meta

            article_url = self.abs_wsj_url(a.get('href'))

            desc = ''
            if container:
                # Use the first paragraph that does not contain the
                # 'Subscriber Content' marker; if all of them do, the last
                # paragraph is kept.
                for p in select_paras(container[0]):
                    desc = self.tag_to_string(p)
                    if 'Subscriber Content' not in desc:
                        break

            articles.append({'title': title, 'url': article_url,
                             'description': desc, 'date': ''})
            self.log('\tFound article:', title)
            self.log('\t\t', desc)

        return articles

    def wsj_find_wn_articles(self, url):
        """Collect the articles from the "What's News" page at *url*.

        Only links whose href contains '/articles/' inside the last
        ``div.whatsNews-simple.whatsNews-itp`` element are used.

        :param url: absolute URL of the What's News page
        :return: list of dicts with 'title', 'url', 'description' and 'date'
        """
        root = self.index_to_soup(url)
        articles = []

        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
        if whats_news:
            select_meta = CSSSelect('.meta_sectionName')
            for a in CSSSelect('a[href]')(whats_news[-1]):
                if '/articles/' not in a.get('href', ''):
                    continue
                container = a.xpath('ancestor::p')
                # Strip section-name markers out of the link text.
                for meta in select_meta(a):
                    meta.getparent().remove(meta)
                title = self.tag_to_string(a).strip()
                article_url = self.abs_wsj_url(a.get('href'))
                desc = ''
                if container:
                    desc = self.tag_to_string(container[0])

                articles.append({'title': title, 'url': article_url,
                                 'description': desc, 'date': ''})
                self.log('\tFound WN article:', title)
                self.log('\t\t', desc)

        return articles

    def wsj_add_feed(self, feeds, title, url):
        """Parse the section at *url* and append it to *feeds* if non-empty.

        A section that fails to parse is logged and skipped, so one broken
        page does not abort the whole download.

        :param feeds: list of ``(title, articles)`` tuples, modified in place
        :param title: section title
        :param url: absolute URL of the section page
        """
        self.log('Found section:', title)
        try:
            if url.endswith('whatsnews'):
                articles = self.wsj_find_wn_articles(url)
            else:
                articles = self.wsj_find_articles(url)
        except Exception:
            # Best effort, but never swallow KeyboardInterrupt/SystemExit
            # and leave a trace of what went wrong.
            self.log.exception('Failed to parse section:', title)
            articles = []
        if articles:
            feeds.append((title, articles))

    def get_wsj_index(self, browser):
        """Build the publication data from the index page.

        :param browser: browser object, used to download the cover file
        :return: dict with key 'index' (list of ``(title, articles)`` feeds)
                 and, when the cover link could be downloaded, 'cover'
        """
        # return self.test_wsj_index()
        ans = {}
        root = self.index_to_soup('http://online.wsj.com/itp')

        # Use the first non-empty date shown on the page as the time stamp.
        for span in CSSSelect('span.date-date')(root):
            if span.text:
                self.timefmt = span.text
                break

        # Only the first cover link is tried.
        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
            href = a.get('href')
            try:
                ans['cover'] = browser.download_file(href)
            except NotAFile:
                self.log.debug('Cover link is not a file:', href)
            break

        feeds = ans['index'] = []
        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
            href = a.get('href', '')
            if '/itp/' not in href:
                continue
            url = self.abs_wsj_url(href)
            if href.endswith('pageone'):
                # The front page yields two feeds: the section itself and
                # the accompanying What's News page.
                self.wsj_add_feed(feeds, 'Front Section', url)
                self.wsj_add_feed(
                    feeds, "What's News", url.replace('pageone', 'whatsnews'))
            else:
                self.wsj_add_feed(feeds, self.tag_to_string(a), url)
        return ans

    def test_wsj_index(self):
        """Return a small fixed index of three articles, for testing (see
        the commented-out call at the top of get_wsj_index())."""
        return {'index': [
            ('Testing', [
                {'title': 'Article One',
                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
                {'title': 'Article Two',
                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
                {'title': 'Article Three',
                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
            ]),
        ]}