mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Update WSJ
This commit is contained in:
parent
63f66f0f38
commit
0d4ee405c3
@ -33,6 +33,19 @@ except ImportError:
|
|||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
|
|
||||||
|
|
||||||
|
def substring_classes(classes):
    """Build a ``class`` attribute filter that matches on substrings.

    ``classes`` is a whitespace-separated string of fragments. The returned
    dict has the shape ``{'attrs': {'class': matcher}}`` where ``matcher(x)``
    returns True when any whitespace-separated token of ``x`` contains any
    of the fragments as a substring, and False otherwise (including when
    ``x`` is empty or None).
    """
    # split() with no argument drops empty tokens. An empty fragment would be
    # a substring of every class name and make the matcher accept everything.
    fragments = frozenset(classes.split())

    def matcher(x):
        # A missing or empty class attribute never matches.
        if not x:
            return False
        return any(
            fragment in candidate
            for candidate in frozenset(x.split())
            for fragment in fragments
        )

    return {'attrs': {'class': matcher}}
|
||||||
|
|
||||||
|
|
||||||
class WSJ(BasicNewsRecipe):
|
class WSJ(BasicNewsRecipe):
|
||||||
|
|
||||||
if needs_subscription:
|
if needs_subscription:
|
||||||
@ -54,10 +67,11 @@ class WSJ(BasicNewsRecipe):
|
|||||||
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
|
dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
|
||||||
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
|
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
|
||||||
dict(name='span', itemprop='author', rel='author'),
|
dict(name='span', itemprop='author', rel='author'),
|
||||||
dict(name='article', id='article-contents articleBody'.split()),
|
dict(name='article', id='article-contents articleBody'.split()),
|
||||||
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
|
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
|
||||||
classes('nc-exp-artbody errorNotFound'),
|
classes('nc-exp-artbody errorNotFound'),
|
||||||
dict(attrs={'data-module-zone': 'article_snippet'}),
|
dict(attrs={'data-module-zone': 'article_snippet'}),
|
||||||
prefixed_classes(
|
prefixed_classes(
|
||||||
@ -199,9 +213,11 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return br
|
return br
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def abs_wsj_url(self, href):
|
def abs_wsj_url(self, href, modify_query=True):
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
href = 'https://www.wsj.com' + href
|
href = 'https://www.wsj.com' + href
|
||||||
|
if modify_query:
|
||||||
|
href = href.split('?')[0] + '?mod=djemalertNEWS'
|
||||||
return href
|
return href
|
||||||
|
|
||||||
def wsj_find_articles(self, url, ahed=False):
|
def wsj_find_articles(self, url, ahed=False):
|
||||||
@ -224,7 +240,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
articles.append({'title': title, 'url': url,
|
articles.append({'title': title, 'url': url,
|
||||||
'description': desc, 'date': ''})
|
'description': desc, 'date': ''})
|
||||||
self.log('\tFound article:', title)
|
self.log('\tFound article:', title)
|
||||||
self.log('\t\t', desc)
|
self.log('\t\t', desc + " " + url)
|
||||||
if self.test and len(articles) >= self.test[1]:
|
if self.test and len(articles) >= self.test[1]:
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -250,7 +266,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
'description': desc, 'date': ''})
|
'description': desc, 'date': ''})
|
||||||
|
|
||||||
self.log('\tFound WN article:', title)
|
self.log('\tFound WN article:', title)
|
||||||
self.log('\t\t', desc)
|
self.log('\t\t', desc + " " + url)
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
@ -290,7 +306,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'), modify_query=False)
|
||||||
self.log('Found section:', title, 'at', url)
|
self.log('Found section:', title, 'at', url)
|
||||||
self.wsj_add_feed(feeds, title, url)
|
self.wsj_add_feed(feeds, title, url)
|
||||||
if frontpage:
|
if frontpage:
|
||||||
@ -304,7 +320,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
def test_wsj_index(self):
|
def test_wsj_index(self):
|
||||||
return [
|
return [
|
||||||
('Testing', [
|
('Testing', [
|
||||||
{'title': 'Article One',
|
{'title': 'Subscriber Article',
|
||||||
'url': 'https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800'}, # noqa
|
'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
|
||||||
]),
|
]),
|
||||||
]
|
]
|
||||||
|
@ -33,6 +33,19 @@ except ImportError:
|
|||||||
needs_subscription = False
|
needs_subscription = False
|
||||||
|
|
||||||
|
|
||||||
|
def substring_classes(classes):
    """Return a ``class`` attribute filter that matches by substring.

    ``classes`` holds whitespace-separated fragments. The result is a dict
    of the form ``{'attrs': {'class': matcher}}``; ``matcher(x)`` is True
    when at least one whitespace-separated token of ``x`` contains at least
    one fragment, and False otherwise (also for an empty or None ``x``).
    """
    # Argument-less split() discards empty tokens; an empty fragment is
    # contained in every string and would turn the filter into match-all.
    wanted = frozenset(classes.split())

    def matcher(x):
        # No class attribute (None or '') means no match.
        if not x:
            return False
        for candidate in frozenset(x.split()):
            if any(piece in candidate for piece in wanted):
                return True
        return False

    return {'attrs': {'class': matcher}}
|
||||||
|
|
||||||
|
|
||||||
class WSJ(BasicNewsRecipe):
|
class WSJ(BasicNewsRecipe):
|
||||||
|
|
||||||
if needs_subscription:
|
if needs_subscription:
|
||||||
@ -54,10 +67,11 @@ class WSJ(BasicNewsRecipe):
|
|||||||
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
|
dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
|
||||||
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
|
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
|
||||||
dict(name='span', itemprop='author', rel='author'),
|
dict(name='span', itemprop='author', rel='author'),
|
||||||
dict(name='article', id='article-contents articleBody'.split()),
|
dict(name='article', id='article-contents articleBody'.split()),
|
||||||
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
|
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
|
||||||
classes('nc-exp-artbody errorNotFound'),
|
classes('nc-exp-artbody errorNotFound'),
|
||||||
dict(attrs={'data-module-zone': 'article_snippet'}),
|
dict(attrs={'data-module-zone': 'article_snippet'}),
|
||||||
prefixed_classes(
|
prefixed_classes(
|
||||||
@ -199,9 +213,11 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return br
|
return br
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def abs_wsj_url(self, href):
|
def abs_wsj_url(self, href, modify_query=True):
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
href = 'https://www.wsj.com' + href
|
href = 'https://www.wsj.com' + href
|
||||||
|
if modify_query:
|
||||||
|
href = href.split('?')[0] + '?mod=djemalertNEWS'
|
||||||
return href
|
return href
|
||||||
|
|
||||||
def wsj_find_articles(self, url, ahed=False):
|
def wsj_find_articles(self, url, ahed=False):
|
||||||
@ -224,7 +240,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
articles.append({'title': title, 'url': url,
|
articles.append({'title': title, 'url': url,
|
||||||
'description': desc, 'date': ''})
|
'description': desc, 'date': ''})
|
||||||
self.log('\tFound article:', title)
|
self.log('\tFound article:', title)
|
||||||
self.log('\t\t', desc)
|
self.log('\t\t', desc + " " + url)
|
||||||
if self.test and len(articles) >= self.test[1]:
|
if self.test and len(articles) >= self.test[1]:
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -250,7 +266,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
'description': desc, 'date': ''})
|
'description': desc, 'date': ''})
|
||||||
|
|
||||||
self.log('\tFound WN article:', title)
|
self.log('\tFound WN article:', title)
|
||||||
self.log('\t\t', desc)
|
self.log('\t\t', desc + " " + url)
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
@ -290,7 +306,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'), modify_query=False)
|
||||||
self.log('Found section:', title, 'at', url)
|
self.log('Found section:', title, 'at', url)
|
||||||
self.wsj_add_feed(feeds, title, url)
|
self.wsj_add_feed(feeds, title, url)
|
||||||
if frontpage:
|
if frontpage:
|
||||||
@ -304,7 +320,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
def test_wsj_index(self):
|
def test_wsj_index(self):
|
||||||
return [
|
return [
|
||||||
('Testing', [
|
('Testing', [
|
||||||
{'title': 'Article One',
|
{'title': 'Subscriber Article',
|
||||||
'url': 'https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800'}, # noqa
|
'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
|
||||||
]),
|
]),
|
||||||
]
|
]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user