# Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-09-29 15:31:08 -04:00)
from calibre.web.feeds.jsnews import JavascriptRecipe
|
|
from calibre.web.jsbrowser.browser import NotAFile
|
|
|
|
def CSSSelect(expr):
    '''Return a compiled lxml XPath for one of the CSS selectors used by
    this recipe.

    The CSS -> XPath translations are hard-coded so the recipe does not
    depend on the cssselect package.  Compiled XPath objects are cached on
    the function itself, so repeated calls with the same selector (this
    recipe evaluates the same handful of selectors once per section page)
    do not rebuild the table or recompile the expression.

    :param expr: one of the CSS selector strings listed below
    :raises KeyError: if ``expr`` is not a known selector
    '''
    cache = getattr(CSSSelect, '_compiled_cache', None)
    if cache is None:
        cache = CSSSelect._compiled_cache = {}
    try:
        return cache[expr]
    except KeyError:
        pass
    xpath = {
        'div.whatsNews-simple': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ')]''',
        'a.mjLinkItem[href]': '''descendant-or-self::a[@class and contains(concat(' ', normalize-space(@class), ' '), ' mjLinkItem ') and (@href)]''',
        '.meta_sectionName': '''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' meta_sectionName ')]''',
        'p': 'descendant-or-self::p',
        'div.whatsNews-simple.whatsNews-itp': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-itp '))]''',
        'a[href]': 'descendant-or-self::a[@href]',
        'span.date-date': "descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' date-date ')]",
        'div.itpSectionHeaderPdf a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpSectionHeaderPdf ')]/descendant-or-self::*/a[@href]",
        'div.itpHeader ul.tab a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpHeader ')]/descendant-or-self::*/ul[@class and contains(concat(' ', normalize-space(@class), ' '), ' tab ')]/descendant-or-self::*/a[@href]",
    }[expr]
    # Imported lazily, as in the original, so merely loading the recipe
    # module does not require lxml
    from lxml.etree import XPath
    ans = cache[expr] = XPath(xpath)
    return ans
class WSJ(JavascriptRecipe):

    '''Download the free articles from the WSJ "Issue Today in Print"
    (ITP) pages, using calibre's JavaScript-capable browser recipe.'''

    title = 'Wall Street Journal (free)'
    __author__ = 'Kovid Goyal'
    description = '''News and current affairs. This recipe only fetches complete
    versions of the articles that are available free on the wsj.com website.
    To get the rest of the articles, subscribe to the WSJ and use the other WSJ
    recipe.'''

    language = 'en'

    # Recompress downloaded images so the generated e-book stays small
    compress_news_images = True
    compress_news_images_auto_size = 5
    max_articles_per_feed = 1000
    # Default date suffix for the title; overwritten at runtime with the
    # issue date scraped from the ITP page (see get_wsj_index())
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    # The same article can be linked from more than one section page
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']

    keep_only_tags = (
        'h1',  # 'h2.subhead', 'h2.subHed.deck',
        'span[itemprop=author][rel=author]',
        'article#article-contents', 'article#articleBody',
        'div#article_story_body', 'header.article_header',
        # Parallax formatting
        'div#ncTitleArea', 'section.nc-exp-artbody',
        # Error conditions, login required and page not found
        'div#snippet-ad-login', 'div.wsj-snippet-body', 'div.wsj-snippet-login', 'div.errorNotFound',
    )

    remove_tags = (
        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
        '.article_tools', 'span[data-country-code][data-ticker-code]',
        'div.nc-exp-artmeta',
    )
def preprocess_stage2(self, article, browser, url, recursion_level):
|
|
# Slideshow and expandable images need to be processed here to
|
|
# set the src attribute correctly
|
|
found = 0
|
|
for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
|
|
img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
|
|
found += 1
|
|
for img in browser.css_select('img[data-enlarge]', all=True):
|
|
img.setAttribute('src', img.attribute('data-enlarge'))
|
|
found += 1
|
|
if found:
|
|
self.log.debug('Found %d dynamic images in:' % found, url)
|
|
|
|
    def get_publication_data(self, browser):
        # Cover and section index both come from the ITP page; delegate
        # to the WSJ-specific index builder.
        return self.get_wsj_index(browser)
def abs_wsj_url(self, href):
|
|
if not href.startswith('http'):
|
|
href = 'http://online.wsj.com' + href
|
|
return href
|
|
|
|
    def wsj_find_articles(self, url):
        '''Parse a regular section page and return its article list.

        Each entry is a dict with title/url/description/date keys, the
        shape the feeds machinery expects.
        '''
        root = self.index_to_soup(url)

        # Drop the "What's News" blurb blocks before scanning for links
        # (presumably they duplicate articles listed elsewhere — the WN
        # page is handled separately by wsj_find_wn_articles())
        for x in CSSSelect('div.whatsNews-simple')(root):
            x.getparent().remove(x)

        articles = []

        for a in CSSSelect('a.mjLinkItem[href]')(root):
            container = a.xpath('ancestor::li')
            # If the link carries a section-name element, detach it from
            # the tree (so it does not leak into the link text) and fold
            # it into the title as a "[Section]" suffix
            meta = CSSSelect('.meta_sectionName')(a)
            if meta:
                meta = meta[0]
                meta.getparent().remove(meta)
                meta = self.tag_to_string(meta)
            title = self.tag_to_string(a)
            if meta:
                title += ' [%s]' % meta
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            if container:
                # Use the first paragraph that is not the "Subscriber
                # Content" teaser as the description
                for p in CSSSelect('p')(container[0]):
                    desc = self.tag_to_string(p)
                    if 'Subscriber Content' not in desc:
                        break

            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})

            self.log('\tFound article:', title)
            self.log('\t\t', desc)
        return articles
    def wsj_find_wn_articles(self, url):
        '''Parse the "What's News" page and return its article list.

        Same return shape as wsj_find_articles(): a list of dicts with
        title/url/description/date keys.
        '''
        root = self.index_to_soup(url)
        articles = []

        # Only the last whatsNews-simple.whatsNews-itp block is scanned
        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
        if whats_news:
            for a in CSSSelect('a[href]')(whats_news[-1]):
                # Skip links that do not point at actual articles
                if '/articles/' not in a.get('href', ''):
                    continue
                container = a.xpath('ancestor::p')
                # Detach section-name elements so they do not leak into
                # the link text
                for meta in CSSSelect('.meta_sectionName')(a):
                    meta.getparent().remove(meta)
                title = self.tag_to_string(a).strip()
                url = self.abs_wsj_url(a.get('href'))
                desc = ''
                if container:
                    desc = self.tag_to_string(container[0])

                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})

                self.log('\tFound WN article:', title)
                self.log('\t\t', desc)

        return articles
def wsj_add_feed(self, feeds, title, url):
|
|
self.log('Found section:', title)
|
|
try:
|
|
if url.endswith('whatsnews'):
|
|
articles = self.wsj_find_wn_articles(url)
|
|
else:
|
|
articles = self.wsj_find_articles(url)
|
|
except:
|
|
articles = []
|
|
if articles:
|
|
feeds.append((title, articles))
|
|
|
|
    def get_wsj_index(self, browser):
        '''Build the recipe index from the "Issue Today in Print" page.

        Returns a dict with an 'index' entry (list of (section title,
        articles) tuples) and, when the front-section PDF download
        succeeds, a 'cover' entry holding whatever browser.download_file
        returns.
        '''
        # return self.test_wsj_index()
        ans = {}
        root = self.index_to_soup('http://online.wsj.com/itp')
        # Replace the static timefmt with the issue date shown on the
        # page, so the book title reflects the actual issue
        for span in CSSSelect('span.date-date')(root):
            if span.text:
                self.timefmt = span.text
                break
        # Try only the first PDF link as the cover; give up quietly if
        # the server does not return a file
        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
            href = a.get('href')
            try:
                ans['cover'] = browser.download_file(href)
            except NotAFile:
                break
            break

        feeds = ans['index'] = []
        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
            # Only tabs that link into the ITP section tree
            if '/itp/' not in a.get('href', ''):
                continue
            pageone = a.get('href').endswith('pageone')
            if pageone:
                # Page One yields two feeds: the front section itself and
                # the "What's News" summaries at the sibling URL
                title = 'Front Section'
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
                title = "What's News"
                url = url.replace('pageone', 'whatsnews')
                self.wsj_add_feed(feeds, title, url)
            else:
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
        return ans
def test_wsj_index(self):
|
|
return {'index': [
|
|
('Testing', [
|
|
{'title': 'Article One',
|
|
'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'}, # noqa
|
|
{'title': 'Article Two',
|
|
'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'}, # noqa
|
|
{'title': 'Article Three',
|
|
'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'}, # noqa
|
|
]),
|
|
]}
|