# calibre/recipes/wsj_free.recipe

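# Scrapes the freely available articles from the WSJ "In Today's Paper" index
# at http://online.wsj.com/itp/today, using calibre's JavaScript-capable
# browser-based news framework (JavascriptRecipe).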
from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.web.jsbrowser.browser import NotAFile


def CSSSelect(expr):
    # Translate the small, fixed set of CSS selectors used by this recipe
    # into pre-compiled XPath expressions (avoiding a dependency on
    # cssselect). The returned XPath object is applied to a parsed lxml
    # tree by calling it, e.g. CSSSelect('a[href]')(root).
    expr = {
        'div.whatsNews-simple': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ')]''',
        'a.mjLinkItem[href]': '''descendant-or-self::a[@class and contains(concat(' ', normalize-space(@class), ' '), ' mjLinkItem ') and (@href)]''',
        '.meta_sectionName': '''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' meta_sectionName ')]''',
        'p': 'descendant-or-self::p',
        'div.whatsNews-simple.whatsNews-itp': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-itp '))]''',
        'a[href]': 'descendant-or-self::a[@href]',
        'span.date-date': "descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' date-date ')]",
        'div.itpSectionHeaderPdf a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpSectionHeaderPdf ')]/descendant-or-self::*/a[@href]",
        'div.itpHeader ul.tab a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpHeader ')]/descendant-or-self::*/ul[@class and contains(concat(' ', normalize-space(@class), ' '), ' tab ')]/descendant-or-self::*/a[@href]",
    }[expr]
    from lxml.etree import XPath
    return XPath(expr)


class WSJ(JavascriptRecipe):

    title = 'Wall Street Journal (free)'
    __author__ = 'Kovid Goyal'
    description = '''News and current affairs. This recipe only fetches complete
    versions of the articles that are available free on the wsj.com website.
    To get the rest of the articles, subscribe to the WSJ and use the other WSJ
    recipe.'''
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 5
    max_articles_per_feed = 1000
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']

    keep_only_tags = (
        'h1',  # 'h2.subhead', 'h2.subHed.deck',
        'span[itemprop=author][rel=author]',
        'article#article-contents', 'article#articleBody',
        'div#article_story_body', 'header.article_header',
        # Parallax formatting
        'div#ncTitleArea', 'section.nc-exp-artbody',
        # Error conditions, login required and page not found
        'div#snippet-ad-login', 'div.wsj-snippet-body', 'div.wsj-snippet-login', 'div.errorNotFound',
    )

    remove_tags = (
        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
        '.article_tools', 'span[data-country-code][data-ticker-code]',
        'div.nc-exp-artmeta',
    )

    def preprocess_stage2(self, article, browser, url, recursion_level):
        # Slideshow and expandable images need to be processed here to
        # set the src attribute correctly
        found = 0
        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
            found += 1
        for img in browser.css_select('img[data-enlarge]', all=True):
            img.setAttribute('src', img.attribute('data-enlarge'))
            found += 1
        if found:
            self.log.debug('Found %d dynamic images in:' % found, url)

    def get_publication_data(self, browser):
        return self.get_wsj_index(browser)

    def abs_wsj_url(self, href):
        if not href.startswith('http'):
            href = 'http://online.wsj.com' + href
        return href

    def wsj_find_articles(self, url, ahed=False):
        root = self.index_to_soup(url)

        # Remove the "What's News" summary boxes before scanning the page
        # for article links; their contents are collected separately by
        # wsj_find_wn_articles()
        for x in CSSSelect('div.whatsNews-simple')(root):
            x.getparent().remove(x)

        articles = []

        for a in CSSSelect('a.mjLinkItem[href]')(root):
            container = a.xpath('ancestor::li')
            meta = CSSSelect('.meta_sectionName')(a)
            if meta:
                meta = meta[0]
                meta.getparent().remove(meta)
                meta = self.tag_to_string(meta)
            title = self.tag_to_string(a)
            if meta:
                title += ' [%s]' % meta
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            if container:
                for p in CSSSelect('p')(container[0]):
                    desc = self.tag_to_string(p)
                    if 'Subscriber Content' not in desc:
                        break

            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})

            self.log('\tFound article:', title)
            self.log('\t\t', desc)

        if ahed:
            # On the front section, also pick up articles from the ahed
            # (front-page feature) list items
            for h2 in root.xpath('//li[@class="ahed_listitem"]/h2'):
                a = h2.xpath('descendant::a')[0]
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a.get('href'))
                desc = ''
                p = h2.xpath('following-sibling::p')
                if p:
                    desc = self.tag_to_string(p[0])
                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})
                self.log('Found article:', title)
                self.log('\t\t', desc)

        return articles
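
    # Parse the "What's News" roundup column, whose markup
    # (div.whatsNews-simple.whatsNews-itp) differs from the regular section
    # pages handled by wsj_find_articles() above.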
    def wsj_find_wn_articles(self, url):
        root = self.index_to_soup(url)
        articles = []

        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
        if whats_news:
            for a in CSSSelect('a[href]')(whats_news[-1]):
                if '/articles/' not in a.get('href', ''):
                    continue
                container = a.xpath('ancestor::p')
                for meta in CSSSelect('.meta_sectionName')(a):
                    meta.getparent().remove(meta)
                title = self.tag_to_string(a).strip()
                url = self.abs_wsj_url(a.get('href'))
                desc = ''
                if container:
                    desc = self.tag_to_string(container[0])

                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})

                self.log('\tFound WN article:', title)
                self.log('\t\t', desc)

        return articles

    def wsj_add_feed(self, feeds, title, url):
        self.log('Found section:', title)
        try:
            if url.endswith('whatsnews'):
                articles = self.wsj_find_wn_articles(url)
            else:
                articles = self.wsj_find_articles(url, ahed=title == 'Front Section')
        except Exception:
            # A failure in one section should not abort the whole download
            self.log.exception('Failed to parse section:', title)
            articles = []
        if articles:
            feeds.append((title, articles))
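
    # Build the publication data calibre needs: the issue date shown on the
    # index page (used as timefmt), the cover (downloaded from the
    # section-header PDF link, if it resolves to a file) and one feed per
    # section tab of the "In Today's Paper" page.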
    def get_wsj_index(self, browser):
        # return self.test_wsj_index()
        ans = {}
        root = self.index_to_soup('http://online.wsj.com/itp/today')

        for span in CSSSelect('span.date-date')(root):
            if span.text:
                self.timefmt = span.text
                break

        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
            href = a.get('href')
            try:
                ans['cover'] = browser.download_file(href)
            except NotAFile:
                break
            break

        feeds = ans['index'] = []
        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
            if '/itp/' not in a.get('href', ''):
                continue
            pageone = a.get('href').endswith('pageone')
            if pageone:
                title = 'Front Section'
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
                title = "What's News"
                url = url.replace('pageone', 'whatsnews')
                self.wsj_add_feed(feeds, title, url)
            else:
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
        return ans
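
    # A canned index that is handy when debugging article processing: have
    # get_wsj_index() return this (see the commented-out call above) to skip
    # fetching and parsing the live index page.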
    def test_wsj_index(self):
        return {'index': [
            ('Testing', [
                {'title': 'Article One',
                 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
                {'title': 'Article Two',
                 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
                {'title': 'Article Three',
                 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
            ]),
        ]}
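
# To try this recipe from the command line (assuming the calibre command-line
# tools are installed), run:
#   ebook-convert wsj_free.recipe output.epub --test
# where --test limits the download to a couple of articles from a couple of
# feeds, for quick iteration during development.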