mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Update WSJ
This commit is contained in:
		
							parent
							
								
									63f66f0f38
								
							
						
					
					
						commit
						0d4ee405c3
					
				@ -33,6 +33,19 @@ except ImportError:
 | 
				
			|||||||
needs_subscription = True
 | 
					needs_subscription = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def substring_classes(classes):
 | 
				
			||||||
 | 
					    q = frozenset(classes.split(' '))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def matcher(x):
 | 
				
			||||||
 | 
					        if x:
 | 
				
			||||||
 | 
					            for candidate in frozenset(x.split()):
 | 
				
			||||||
 | 
					                for x in q:
 | 
				
			||||||
 | 
					                    if x in candidate:
 | 
				
			||||||
 | 
					                        return True
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					    return {'attrs': {'class': matcher}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class WSJ(BasicNewsRecipe):
 | 
					class WSJ(BasicNewsRecipe):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if needs_subscription:
 | 
					    if needs_subscription:
 | 
				
			||||||
@ -54,10 +67,11 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 | 
					    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    keep_only_tags = [
 | 
					    keep_only_tags = [
 | 
				
			||||||
 | 
					        dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
 | 
				
			||||||
        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
 | 
					        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
 | 
				
			||||||
        dict(name='span', itemprop='author', rel='author'),
 | 
					        dict(name='span', itemprop='author', rel='author'),
 | 
				
			||||||
        dict(name='article', id='article-contents articleBody'.split()),
 | 
					        dict(name='article', id='article-contents articleBody'.split()),
 | 
				
			||||||
        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
 | 
					        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
 | 
				
			||||||
        classes('nc-exp-artbody errorNotFound'),
 | 
					        classes('nc-exp-artbody errorNotFound'),
 | 
				
			||||||
        dict(attrs={'data-module-zone': 'article_snippet'}),
 | 
					        dict(attrs={'data-module-zone': 'article_snippet'}),
 | 
				
			||||||
        prefixed_classes(
 | 
					        prefixed_classes(
 | 
				
			||||||
@ -199,9 +213,11 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
            return br
 | 
					            return br
 | 
				
			||||||
    # }}}
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def abs_wsj_url(self, href):
 | 
					    def abs_wsj_url(self, href, modify_query=True):
 | 
				
			||||||
        if not href.startswith('http'):
 | 
					        if not href.startswith('http'):
 | 
				
			||||||
            href = 'https://www.wsj.com' + href
 | 
					            href = 'https://www.wsj.com' + href
 | 
				
			||||||
 | 
					        if modify_query:
 | 
				
			||||||
 | 
					            href = href.split('?')[0] + '?mod=djemalertNEWS'
 | 
				
			||||||
        return href
 | 
					        return href
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def wsj_find_articles(self, url, ahed=False):
 | 
					    def wsj_find_articles(self, url, ahed=False):
 | 
				
			||||||
@ -224,7 +240,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
            articles.append({'title': title, 'url': url,
 | 
					            articles.append({'title': title, 'url': url,
 | 
				
			||||||
                             'description': desc, 'date': ''})
 | 
					                             'description': desc, 'date': ''})
 | 
				
			||||||
            self.log('\tFound article:', title)
 | 
					            self.log('\tFound article:', title)
 | 
				
			||||||
            self.log('\t\t', desc)
 | 
					            self.log('\t\t', desc + " " + url)
 | 
				
			||||||
            if self.test and len(articles) >= self.test[1]:
 | 
					            if self.test and len(articles) >= self.test[1]:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -250,7 +266,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
                                'description': desc, 'date': ''})
 | 
					                                'description': desc, 'date': ''})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.log('\tFound WN article:', title)
 | 
					            self.log('\tFound WN article:', title)
 | 
				
			||||||
            self.log('\t\t', desc)
 | 
					            self.log('\t\t', desc + " " + url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return articles
 | 
					        return articles
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -290,7 +306,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
                title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
 | 
					                title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
 | 
				
			||||||
                if not title:
 | 
					                if not title:
 | 
				
			||||||
                    continue
 | 
					                    continue
 | 
				
			||||||
                url = self.abs_wsj_url(a.get('href'))
 | 
					                url = self.abs_wsj_url(a.get('href'), modify_query=False)
 | 
				
			||||||
                self.log('Found section:', title, 'at', url)
 | 
					                self.log('Found section:', title, 'at', url)
 | 
				
			||||||
                self.wsj_add_feed(feeds, title, url)
 | 
					                self.wsj_add_feed(feeds, title, url)
 | 
				
			||||||
                if frontpage:
 | 
					                if frontpage:
 | 
				
			||||||
@ -304,7 +320,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
    def test_wsj_index(self):
 | 
					    def test_wsj_index(self):
 | 
				
			||||||
        return [
 | 
					        return [
 | 
				
			||||||
            ('Testing', [
 | 
					            ('Testing', [
 | 
				
			||||||
                {'title': 'Article One',
 | 
					                {'title': 'Subscriber Article',
 | 
				
			||||||
                 'url': 'https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800'},  # noqa
 | 
					                 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
 | 
				
			||||||
            ]),
 | 
					            ]),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
				
			|||||||
@ -33,6 +33,19 @@ except ImportError:
 | 
				
			|||||||
needs_subscription = False
 | 
					needs_subscription = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def substring_classes(classes):
 | 
				
			||||||
 | 
					    q = frozenset(classes.split(' '))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def matcher(x):
 | 
				
			||||||
 | 
					        if x:
 | 
				
			||||||
 | 
					            for candidate in frozenset(x.split()):
 | 
				
			||||||
 | 
					                for x in q:
 | 
				
			||||||
 | 
					                    if x in candidate:
 | 
				
			||||||
 | 
					                        return True
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					    return {'attrs': {'class': matcher}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class WSJ(BasicNewsRecipe):
 | 
					class WSJ(BasicNewsRecipe):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if needs_subscription:
 | 
					    if needs_subscription:
 | 
				
			||||||
@ -54,10 +67,11 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 | 
					    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    keep_only_tags = [
 | 
					    keep_only_tags = [
 | 
				
			||||||
 | 
					        dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
 | 
				
			||||||
        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
 | 
					        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
 | 
				
			||||||
        dict(name='span', itemprop='author', rel='author'),
 | 
					        dict(name='span', itemprop='author', rel='author'),
 | 
				
			||||||
        dict(name='article', id='article-contents articleBody'.split()),
 | 
					        dict(name='article', id='article-contents articleBody'.split()),
 | 
				
			||||||
        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
 | 
					        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
 | 
				
			||||||
        classes('nc-exp-artbody errorNotFound'),
 | 
					        classes('nc-exp-artbody errorNotFound'),
 | 
				
			||||||
        dict(attrs={'data-module-zone': 'article_snippet'}),
 | 
					        dict(attrs={'data-module-zone': 'article_snippet'}),
 | 
				
			||||||
        prefixed_classes(
 | 
					        prefixed_classes(
 | 
				
			||||||
@ -199,9 +213,11 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
            return br
 | 
					            return br
 | 
				
			||||||
    # }}}
 | 
					    # }}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def abs_wsj_url(self, href):
 | 
					    def abs_wsj_url(self, href, modify_query=True):
 | 
				
			||||||
        if not href.startswith('http'):
 | 
					        if not href.startswith('http'):
 | 
				
			||||||
            href = 'https://www.wsj.com' + href
 | 
					            href = 'https://www.wsj.com' + href
 | 
				
			||||||
 | 
					        if modify_query:
 | 
				
			||||||
 | 
					            href = href.split('?')[0] + '?mod=djemalertNEWS'
 | 
				
			||||||
        return href
 | 
					        return href
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def wsj_find_articles(self, url, ahed=False):
 | 
					    def wsj_find_articles(self, url, ahed=False):
 | 
				
			||||||
@ -224,7 +240,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
            articles.append({'title': title, 'url': url,
 | 
					            articles.append({'title': title, 'url': url,
 | 
				
			||||||
                             'description': desc, 'date': ''})
 | 
					                             'description': desc, 'date': ''})
 | 
				
			||||||
            self.log('\tFound article:', title)
 | 
					            self.log('\tFound article:', title)
 | 
				
			||||||
            self.log('\t\t', desc)
 | 
					            self.log('\t\t', desc + " " + url)
 | 
				
			||||||
            if self.test and len(articles) >= self.test[1]:
 | 
					            if self.test and len(articles) >= self.test[1]:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -250,7 +266,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
                                'description': desc, 'date': ''})
 | 
					                                'description': desc, 'date': ''})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.log('\tFound WN article:', title)
 | 
					            self.log('\tFound WN article:', title)
 | 
				
			||||||
            self.log('\t\t', desc)
 | 
					            self.log('\t\t', desc + " " + url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return articles
 | 
					        return articles
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -290,7 +306,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
                title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
 | 
					                title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
 | 
				
			||||||
                if not title:
 | 
					                if not title:
 | 
				
			||||||
                    continue
 | 
					                    continue
 | 
				
			||||||
                url = self.abs_wsj_url(a.get('href'))
 | 
					                url = self.abs_wsj_url(a.get('href'), modify_query=False)
 | 
				
			||||||
                self.log('Found section:', title, 'at', url)
 | 
					                self.log('Found section:', title, 'at', url)
 | 
				
			||||||
                self.wsj_add_feed(feeds, title, url)
 | 
					                self.wsj_add_feed(feeds, title, url)
 | 
				
			||||||
                if frontpage:
 | 
					                if frontpage:
 | 
				
			||||||
@ -304,7 +320,7 @@ class WSJ(BasicNewsRecipe):
 | 
				
			|||||||
    def test_wsj_index(self):
 | 
					    def test_wsj_index(self):
 | 
				
			||||||
        return [
 | 
					        return [
 | 
				
			||||||
            ('Testing', [
 | 
					            ('Testing', [
 | 
				
			||||||
                {'title': 'Article One',
 | 
					                {'title': 'Subscriber Article',
 | 
				
			||||||
                 'url': 'https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800'},  # noqa
 | 
					                 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
 | 
				
			||||||
            ]),
 | 
					            ]),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user