Improved free WSJ recipe

Kovid Goyal 2010-01-20 14:58:07 -07:00
parent 0b5541edc2
commit 096735a456

@@ -3,47 +3,122 @@
 __license__ = 'GPL v3'
 '''
-online.wsj.com.com
+online.wsj.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+from datetime import timedelta, datetime, date

 class WSJ(BasicNewsRecipe):

     # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
     title = u'Wall Street Journal (free)'
     __author__ = 'Nick Redding'
     language = 'en'
-    description = ('All the free content from the Wall Street Journal (business'
-                   ', financial and political news)')
+    description = ('All the free content from the Wall Street Journal (business, financial and political news)')

     no_stylesheets = True
     timefmt = ' [%b %d]'
-    extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
-                h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
-                .subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
-                .insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;}
-                .targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
-                .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
-                .tagline { ont-size:xx-small;}
-                .dateStamp {font-family:Arial,Helvetica,sans-serif;}
-                h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
-                .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;}
+
+    # customization notes: delete sections you are not interested in
+    # set omit_paid_content to False if you want the paid content article snippets
+    # set oldest_article to the maximum number of days back from today to include articles
+    sectionlist = [
+        ['/home-page','Front Page'],
+        ['/public/page/news-opinion-commentary.html','Commentary'],
+        ['/public/page/news-global-world.html','World News'],
+        ['/public/page/news-world-business.html','US News'],
+        ['/public/page/news-business-us.html','Business'],
+        ['/public/page/news-financial-markets-stock.html','Markets'],
+        ['/public/page/news-tech-technology.html','Technology'],
+        ['/public/page/news-personal-finance.html','Personal Finance'],
+        ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
+        ['/public/page/news-real-estate-homes.html','Real Estate'],
+        ['/public/page/news-career-jobs.html','Careers'],
+        ['/public/page/news-small-business-marketing.html','Small Business']
+        ]
+    oldest_article = 2
+    omit_paid_content = True
+
+    extra_css = '''h1{font-size:large; font-family:Times,serif;}
+                h2{font-family:Times,serif; font-size:small; font-style:italic;}
+                .subhead{font-family:Times,serif; font-size:small; font-style:italic;}
+                .insettipUnit {font-family:Times,serif;font-size:xx-small;}
+                .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
+                .article{font-family:Times,serif; font-size:x-small;}
+                .tagline { font-size:xx-small;}
+                .dateStamp {font-family:Times,serif;}
+                h3{font-family:Times,serif; font-size:xx-small;}
+                .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                 .metadataType-articleCredits {list-style-type: none;}
-                h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;}
+                h6{font-family:Times,serif; font-size:small; font-style:italic;}
                 .paperLocation{font-size:xx-small;}'''

-    remove_tags_before = dict(name='h1')
-    remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
-                    "articleTabs_tab_interactive","articleTabs_tab_video",
-                    "articleTabs_tab_map","articleTabs_tab_slideshow"]),
-                    {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
-                    'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip',
-                    'adSummary', 'nav-inline','insetFullBracket']},
-                    dict(rel='shortcut icon'),
+    remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
+    remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}),
+                    #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
+                    #    "articleTabs_tab_interactive","articleTabs_tab_video",
+                    #    "articleTabs_tab_map","articleTabs_tab_slideshow"]),
+                    {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
+                    'insettip','insetClose','more_in', "insetContent",
+                    #'articleTools_bottom','articleTools_bottom mjArticleTools',
+                    'aTools', 'tooltip',
+                    'adSummary', 'nav-inline','insetFullBracket']},
+                    dict({'class':re.compile('^articleTools_bottom')}),
+                    dict(rel='shortcut icon')
                     ]
     remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]

+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        return br
+
     def preprocess_html(self,soup):
+
+        # check if article is too old
+        datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
+        if datetag:
+            dateline_string = self.tag_to_string(datetag,False)
+            date_items = dateline_string.split(',')
+            datestring = date_items[0]+date_items[1]
+            article_date = datetime.strptime(datestring.title(),"%B %d %Y")
+            earliest_date = date.today() - timedelta(days=self.oldest_article)
+            if article_date.date() < earliest_date:
+                self.log("Skipping article dated %s" % datestring)
+                return None
+            datetag.parent.extract()
+
+            # place dateline in article heading
+            bylinetag = soup.find('h3','byline')
+            if bylinetag:
+                h3bylinetag = bylinetag
+            else:
+                bylinetag = soup.find('li','byline')
+                if bylinetag:
+                    h3bylinetag = bylinetag.h3
+                    if not h3bylinetag:
+                        h3bylinetag = bylinetag
+                    bylinetag = bylinetag.parent
+            if bylinetag:
+                if h3bylinetag.a:
+                    bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
+                else:
+                    bylinetext = self.tag_to_string(h3bylinetag,False)
+                h3byline = Tag(soup,'h3',[('class','byline')])
+                if bylinetext.isspace() or (bylinetext == ''):
+                    h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
+                else:
+                    h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
+                bylinetag.replaceWith(h3byline)
+            else:
+                headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
+                if headlinetag:
+                    dateline = Tag(soup,'h3', [('class','byline')])
+                    dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
+                    headlinetag.insert(len(headlinetag),dateline)
+        else: # if no date tag, don't process this page--it's not a news item
+            return None
+
         # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
         ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
         if ultag:
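
The new age gate above works by rebuilding a date from the WSJ dateline text and comparing it against oldest_article. A minimal standalone sketch of the same parsing steps, assuming a dateline of the form the recipe expects (the sample string is an assumption, not taken from real WSJ markup):

    from datetime import timedelta, datetime, date

    oldest_article = 2  # days back, as in the recipe

    # hypothetical dateline text from the <li class="dateStamp..."> tag
    dateline_string = 'JANUARY 20, 2010, 2:58 P.M. ET'

    # same steps as the recipe: split on commas, keep '<Month day> <year>'
    date_items = dateline_string.split(',')
    datestring = date_items[0] + date_items[1]            # 'JANUARY 20 2010'
    article_date = datetime.strptime(datestring.title(), '%B %d %Y')

    earliest_date = date.today() - timedelta(days=oldest_article)
    if article_date.date() < earliest_date:
        print('Skipping article dated %s' % datestring)

The .title() call matters: strptime's %B expects 'January', not 'JANUARY'.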
@@ -58,7 +133,7 @@ class WSJ(BasicNewsRecipe):
         key = None
         ans = []

-        def parse_index_page(page_name,page_title,omit_paid_content):
+        def parse_index_page(page_name,page_title):

             def article_title(tag):
                 atag = tag.find('h2') # title is usually in an h2 tag
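
Dropping the omit_paid_content parameter works because parse_index_page is defined inside parse_index: self is captured by the closure, so the flag can live on the instance instead of being threaded through every call. A reduced sketch of the pattern, with made-up names rather than the recipe's real control flow:

    class Recipe(object):
        omit_paid_content = True  # class attribute, like the recipe's

        def parse_index(self):
            def parse_index_page(page_name, page_title):
                # 'self' comes from the enclosing method's scope,
                # so no extra parameter is needed
                mode = 'free only' if self.omit_paid_content else 'everything'
                return '%s (%s): %s' % (page_title, page_name, mode)
            return [parse_index_page('/home-page', 'Front Page')]

    print(Recipe().parse_index())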
@@ -119,7 +194,6 @@ class WSJ(BasicNewsRecipe):
             soup = self.index_to_soup(pageurl)
             # Find each instance of div with class including "headlineSummary"
             for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
-                # divtag contains all article data as ul's and li's
                 # first, check if there is an h3 tag which provides a section name
                 stag = divtag.find('h3')
@@ -162,7 +236,7 @@ class WSJ(BasicNewsRecipe):
                 # now skip paid subscriber articles if desired
                 subscriber_tag = litag.find(text="Subscriber Content")
                 if subscriber_tag:
-                    if omit_paid_content:
+                    if self.omit_paid_content:
                         continue
                 # delete the tip div so it doesn't get in the way
                 tiptag = litag.find("div", { "class" : "tipTargetBox" })
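
The gate itself relies on BeautifulSoup's text search: find(text=...) returns the matching NavigableString when the label occurs anywhere under the tag, and None otherwise, so it works directly as a boolean. A small sketch against synthetic markup; it uses bs4 so it runs standalone, whereas calibre bundles its own BeautifulSoup:

    from bs4 import BeautifulSoup

    # made-up headline markup carrying the subscriber label
    html = '<li><h2>Some headline</h2><span>Subscriber Content</span></li>'
    litag = BeautifulSoup(html, 'html.parser').li

    # same test as the recipe: truthy when the label is present
    subscriber_tag = litag.find(text='Subscriber Content')
    if subscriber_tag:
        print('paid article, skipping')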
@@ -185,7 +259,7 @@ class WSJ(BasicNewsRecipe):
                     continue
                 if url.startswith("/article"):
                     url = mainurl+url
-                if not url.startswith("http"):
+                if not url.startswith("http://online.wsj.com"):
                     continue
                 if not url.endswith(".html"):
                     continue
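
Tightening the prefix test from "http" to "http://online.wsj.com" drops off-site links that previously slipped through: relative /article links are first made absolute, then anything not on online.wsj.com or not ending in .html is discarded. A condensed sketch of the whole filter; mainurl's value and the sample URLs are assumptions for illustration:

    mainurl = 'http://online.wsj.com'

    def keep(url):
        if url.startswith('/article'):
            url = mainurl + url              # make relative article links absolute
        if not url.startswith('http://online.wsj.com'):
            return None                      # off-site link: skip
        if not url.endswith('.html'):
            return None                      # not a regular article page: skip
        return url

    for u in ['/article/SB1000.html',
              'http://blogs.wsj.com/economics/post.html',
              'http://online.wsj.com/video/abc']:
        print('%s -> %s' % (u, keep(u)))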
@@ -214,48 +288,10 @@ class WSJ(BasicNewsRecipe):
                     articles[page_title] = []
                 articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))

-        # customization notes: delete sections you are not interested in
-        # set omit_paid_content to False if you want the paid content article previews
-        sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets',
-                       'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business']
-        omit_paid_content = True
-
-        if 'Front Page' in sectionlist:
-            parse_index_page('/home-page','Front Page',omit_paid_content)
-            ans.append('Front Page')
-        if 'Commentary' in sectionlist:
-            parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content)
-            ans.append('Commentary')
-        if 'World News' in sectionlist:
-            parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content)
-            ans.append('World News')
-        if 'US News' in sectionlist:
-            parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content)
-            ans.append('US News')
-        if 'Business' in sectionlist:
-            parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content)
-            ans.append('Business')
-        if 'Markets' in sectionlist:
-            parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content)
-            ans.append('Markets')
-        if 'Technology' in sectionlist:
-            parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content)
-            ans.append('Technology')
-        if 'Personal Finance' in sectionlist:
-            parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content)
-            ans.append('Personal Finance')
-        if 'Life & Style' in sectionlist:
-            parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content)
-            ans.append('Life & Style')
-        if 'Real Estate' in sectionlist:
-            parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content)
-            ans.append('Real Estate')
-        if 'Careers' in sectionlist:
-            parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content)
-            ans.append('Careers')
-        if 'Small Business' in sectionlist:
-            parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content)
-            ans.append('Small Business')
+        for page_name,page_title in self.sectionlist:
+            parse_index_page(page_name,page_title)
+            ans.append(page_title)

         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
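
Because the section table and the flags are now class attributes, a user copy of the recipe can be customized by overriding data instead of deleting if-blocks. A hedged sketch of such a tweak; the subclass name is invented:

    # hypothetical user customization: two sections, older articles, keep paid snippets
    class MyWSJ(WSJ):
        sectionlist = [
            ['/home-page', 'Front Page'],
            ['/public/page/news-financial-markets-stock.html', 'Markets'],
            ]
        oldest_article = 4
        omit_paid_content = False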