Update WSJ

2025-06-23 15:30:45 -04:00 · 2014-10-09 10:02:43 +05:30 · 2014-10-09 10:02:43 +05:30 · 7028b7ab18
commit 7028b7ab18
parent 4404b6ff95
2 changed files with 24 additions and 8 deletions
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

 from calibre.web.feeds.news import BasicNewsRecipe
-import copy
+import copy, re

 # http://online.wsj.com/page/us_in_todays_paper.html

@@ -22,6 +22,7 @@ class WallStreetJournal(BasicNewsRecipe):
    timefmt  = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
+    remove_attributes = ['style', 'data-scrim']

    keep_only_tags = [
        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
@@ -29,12 +30,16 @@ class WallStreetJournal(BasicNewsRecipe):
        dict(name='article', id=['article-contents', 'articleBody']),
        dict(name='div', id='article_story_body'),
        dict(name='div', attrs={'class':'snippet-ad-login'}),
-        dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
    ]
    remove_tags = [
-        dict(attrs={'class':['insetButton', 'insettipBox']}),
+        dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
+        dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
    ]
+    preprocess_regexps = [
+        (re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
+        (re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
+    ]

    use_javascript_to_login = True

@@ -47,7 +52,7 @@ class WallStreetJournal(BasicNewsRecipe):

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
+            picdiv = soup.find('img', src=True)
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

@@ -57,6 +62,9 @@ class WallStreetJournal(BasicNewsRecipe):
            img = div.find('img')
            if img is not None:
                img.extract()
+        # Use large images
+        for img in soup.findAll('img', attrs={'data-enlarge':True}):
+            img['src'] = img['data-enlarge']

        return soup

--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'

 from calibre.web.feeds.news import BasicNewsRecipe
-import copy
+import copy, re

 class WallStreetJournal(BasicNewsRecipe):

@@ -20,6 +20,7 @@ class WallStreetJournal(BasicNewsRecipe):
    timefmt  = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
+    remove_attributes = ['style', 'data-scrim']

    keep_only_tags = [
        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
@@ -27,16 +28,20 @@ class WallStreetJournal(BasicNewsRecipe):
        dict(name='article', id=['article-contents', 'articleBody']),
        dict(name='div', id='article_story_body'),
        dict(name='div', attrs={'class':'snippet-ad-login'}),
-        dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
    ]
    remove_tags = [
-        dict(attrs={'class':['insetButton', 'insettipBox']}),
+        dict(attrs={'class':['insetButton', 'insettipBox', 'author-info', 'media-object-video']}),
+        dict(attrs={'class':lambda x: x and 'article_tools' in x.split()}),
        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
    ]
+    preprocess_regexps = [
+        (re.compile(r'<!--\[if lte IE 8\]>.+?<!\[endif\]-->', re.DOTALL), lambda m: ''),
+        (re.compile(r'<!\[if ! lte IE 8\]>.+?<!\[endif\]>', re.DOTALL), lambda m:''),
+    ]

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
+            picdiv = soup.find('img', src=True)
            if picdiv is not None:
                self.add_toc_thumbnail(article,picdiv['src'])

@@ -46,6 +51,9 @@ class WallStreetJournal(BasicNewsRecipe):
            img = div.find('img')
            if img is not None:
                img.extract()
+        # Use large images
+        for img in soup.findAll('img', attrs={'data-enlarge':True}):
+            img['src'] = img['data-enlarge']

        return soup