From 7028b7ab18f9634135c72029aefe6b51dfa131a0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 9 Oct 2014 10:02:43 +0530 Subject: [PATCH] Update WSJ --- recipes/wsj.recipe | 16 ++++++++++++---- recipes/wsj_free.recipe | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index 8a0de0b381..433a4709e8 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe -import copy +import copy, re # http://online.wsj.com/page/us_in_todays_paper.html @@ -22,6 +22,7 @@ class WallStreetJournal(BasicNewsRecipe): timefmt = ' [%a, %b %d, %Y]' no_stylesheets = True ignore_duplicate_articles = {'url'} + remove_attributes = ['style', 'data-scrim'] keep_only_tags = [ dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}), @@ -29,12 +30,16 @@ class WallStreetJournal(BasicNewsRecipe): dict(name='article', id=['article-contents', 'articleBody']), dict(name='div', id='article_story_body'), dict(name='div', attrs={'class':'snippet-ad-login'}), - dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}), ] remove_tags = [ - dict(attrs={'class':['insetButton', 'insettipBox']}), + dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}), + dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}), dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}), ] + preprocess_regexps = [ + (re.compile(r'', re.DOTALL), lambda m: ''), + (re.compile(r'.+?', re.DOTALL), lambda m:''), + ] use_javascript_to_login = True @@ -47,7 +52,7 @@ class WallStreetJournal(BasicNewsRecipe): def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): - picdiv = soup.find('img') + picdiv = soup.find('img', src=True) if picdiv is not None: self.add_toc_thumbnail(article,picdiv['src']) @@ -57,6 +62,9 @@ class WallStreetJournal(BasicNewsRecipe): img = div.find('img') if img is not None: img.extract() + # Use large images + for img in soup.findAll('img', attrs={'data-enlarge':True}): + img['src'] = img['data-enlarge'] return soup diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 34495a36d5..3e00480971 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe -import copy +import copy, re class WallStreetJournal(BasicNewsRecipe): @@ -20,6 +20,7 @@ class WallStreetJournal(BasicNewsRecipe): timefmt = ' [%a, %b %d, %Y]' no_stylesheets = True ignore_duplicate_articles = {'url'} + remove_attributes = ['style', 'data-scrim'] keep_only_tags = [ dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}), @@ -27,16 +28,20 @@ class WallStreetJournal(BasicNewsRecipe): dict(name='article', id=['article-contents', 'articleBody']), dict(name='div', id='article_story_body'), dict(name='div', attrs={'class':'snippet-ad-login'}), - dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}), ] remove_tags = [ - dict(attrs={'class':['insetButton', 'insettipBox']}), + dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}), + dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}), dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}), ] + preprocess_regexps = [ + (re.compile(r'', re.DOTALL), lambda m: ''), + (re.compile(r'.+?', re.DOTALL), lambda m:''), + ] def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): - picdiv = soup.find('img') + picdiv = soup.find('img', src=True) if picdiv is not None: self.add_toc_thumbnail(article,picdiv['src']) @@ -46,6 +51,9 @@ class WallStreetJournal(BasicNewsRecipe): img = div.find('img') if img is not None: img.extract() + # Use large images + for img in soup.findAll('img', attrs={'data-enlarge':True}): + img['src'] = img['data-enlarge'] return soup